import scrapy
import json
import requests
from datetime import datetime
from urllib.parse import urljoin

|
class DoubanTVSpider(scrapy.Spider):
    """Scrape trending TV shows from Douban's JSON search API.

    Flow: start_requests -> parse (JSON listing + pagination) ->
    parse_detail (HTML detail page), which POSTs the assembled record
    to an internal film-works API.
    """

    name = 'douban_tv'
    allowed_domains = ['movie.douban.com']

    # Spider-level throttling: Douban bans aggressive crawlers quickly.
    custom_settings = {
        'DOWNLOAD_DELAY': 2,       # pause between requests to avoid a ban
        'CONCURRENT_REQUESTS': 1,  # one in-flight request at a time
        'ROBOTSTXT_OBEY': False,
        'LOG_LEVEL': 'DEBUG',      # verbose logging while debugging
    }

    def start_requests(self):
        """Seed the crawl with the first page of the 'hot' TV listing.

        NOTE(review): a comment in the original claimed the (newer)
        ``start()`` replaces the deprecated ``start_requests()``, yet the
        code defines ``start_requests()``. It is kept as-is because it
        works across all Scrapy versions.
        """
        # Browser-like headers: Douban rejects requests without a UA/Referer.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/112.0.0.0 Safari/537.36',
            'Referer': 'https://movie.douban.com/tv/',
        }
        # tag=%E7%83%AD%E9%97%A8 is the URL-encoded "热门" (trending) tag.
        # (A commented-out variant in the original also crawled movies via
        # the new_search_subjects endpoint.)
        urls = [
            "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&sort=recommend&start=0",
        ]
        for url in urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        """Parse one page of the JSON listing and follow each detail page."""
        try:
            data = json.loads(response.text)
            self.logger.info(f"解析到的数据: {data}")
        except json.JSONDecodeError:
            self.logger.error("JSON解析失败: %s", response.url)
            return

        # The search_subjects API returns its results under "subjects".
        items = data.get('subjects', [])
        if not items:
            self.logger.error("未找到subjects数据")
            return

        for item in items:
            detail_url = item.get('url')
            if detail_url:
                yield scrapy.Request(
                    detail_url,
                    callback=self.parse_detail,
                    meta={'item': item},
                )

        # Pagination: bump the trailing start= offset by one page (20 items),
        # stopping after the first three pages (start < 60).
        try:
            current_start = int(response.url.split('start=')[-1])
        except ValueError:
            # start= was not the last query parameter; skip pagination
            # rather than crash the callback.
            self.logger.error("JSON解析失败: %s", response.url)
            return
        next_start = current_start + 20
        if next_start < 60:
            next_url = response.url.replace(
                f'start={current_start}', f'start={next_start}')
            yield response.follow(next_url, callback=self.parse)

    def parse_detail(self, response):
        """Extract full metadata from a detail page and POST it to the API.

        ``response.meta['item']`` carries the listing entry (whose ``url``
        field, e.g. https://movie.douban.com/subject/25754848/, led here).
        """
        item = response.meta['item']

        title = item.get('title', '').strip()

        # BUGFIX: the original used the CSS pseudo-class ':contains(...)',
        # which parsel/cssselect does not support and which raises at
        # runtime; the XPath equivalents below are used instead.
        # Layout assumption (verify against the live page): the label span
        # ("导演"/"主演") is followed by a sibling <span class="attrs">
        # holding the <a> links.
        director = ','.join(response.xpath(
            '//div[@id="info"]//span[contains(text(), "导演")]'
            '/following-sibling::span[@class="attrs"][1]//a/text()').getall())
        actors = ','.join(response.xpath(
            '//div[@id="info"]//span[contains(text(), "主演")]'
            '/following-sibling::span[@class="attrs"][1]//a/text()').getall())

        # Genres are marked up with property="v:genre".
        type_text = ','.join(
            response.css('#info span[property="v:genre"]::text').getall())

        # Producer country/region: first text node after the label.
        # BUGFIX: .get() may return None when the node is missing; the
        # original crashed on .strip() in that case.
        producer = (response.xpath(
            '//span[contains(text(), "制片国家/地区")]'
            '/following::text()[1]').get() or '').strip()

        # Synopsis: join every text node (visible and hidden) and collapse
        # whitespace runs into single spaces.
        synopsis_parts = response.xpath(
            '//span[@property="v:summary"]//text()').getall()
        synopsis = ' '.join(''.join(synopsis_parts).strip().split())

        # Release year: the ".year" element renders as "(2023)"; pull the
        # digits, falling back to the stripped text or "未知".
        year = response.css('.year::text').re_first(r'(\d+)')
        if not year:
            year_text = response.css('.year::text').get()
            year = year_text.strip('()') if year_text else "未知"

        # Cover image URL and alt text.
        cover_url = response.css('#mainpic img::attr(src)').get('')
        cover_alt = response.css('#mainpic img::attr(alt)').get('')

        api_data = {
            'nameCn': title,
            'nameEn': '',  # Douban does not expose an English title here
            'type': 'tv',
            'releaseYear': year,
            'director': director,
            'producer': producer,
            'actors': actors,
            'keywords': type_text,
            'synopsis': synopsis,
            'coverUrl': cover_url,
            'coverAlt': cover_alt,
            'userType': 'official',
        }

        # Push the record to the internal API.
        # NOTE(review): requests.post is blocking and stalls the Scrapy
        # reactor; consider an item pipeline instead. The result variable is
        # renamed so it no longer shadows the Scrapy `response` argument.
        try:
            api_response = requests.post(
                'http://192.168.1.213:8080/flower/api/filmWorks/new',
                json=api_data,
                headers={'Content-Type': 'application/json'},
            )
            if api_response.status_code == 200:
                self.logger.info(f"成功保存作品: {title}")
            else:
                self.logger.error(
                    f"保存失败: {title}, 状态码: {api_response.status_code}")
        except Exception as e:
            self.logger.error(f"API请求失败: {str(e)}")