# import scrapy
# import json
# import requests
# from datetime import datetime
# from urllib.parse import urljoin
#
#
# class DoubanTVSpider(scrapy.Spider):
#     name = 'douban_tv'
#     allowed_domains = ['movie.douban.com']
#
#     # NOTE(review): the original note here said start() replaces the
#     # deprecated start_requests(), but the code still overrides
#     # start_requests() — Scrapy >= 2.13 recommends start() instead.
#     def start_requests(self):
#         headers = {
#             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/112.0.0.0 Safari/537.36'
#         }
#         # 同时爬取电影和电视剧
#         urls = [
#             "https://movie.douban.com/j/new_search_subjects?sort=U&tag=电视剧&start=0",
#             "https://movie.douban.com/j/new_search_subjects?sort=U&tag=电影&start=0"
#         ]
#
#         for url in urls:
#             yield scrapy.Request(url, headers=headers, callback=self.parse)
#
#     def parse(self, response):
#         try:
#             data = json.loads(response.text)
#         except json.JSONDecodeError:
#             self.logger.error("JSON解析失败: %s", response.url)
#             return
#
#         for item in data.get('data', []):
#             detail_url = item.get('url')
#             if detail_url:
#                 yield scrapy.Request(
#                     detail_url,
#                     callback=self.parse_detail,
#                     meta={'item': item}
#                 )
#
#         # 分页逻辑
#         current_start = int(response.url.split('start=')[-1])
#         next_start = current_start + 20
#         if next_start < 60:
#             next_url = response.url.replace(f'start={current_start}', f'start={next_start}')
#             yield response.follow(next_url, callback=self.parse)
#     def parse_detail(self, response):
#         item = response.meta['item']
#
#         # 获取详细信息
#         title = item.get('title', '').strip()
#         rate = item.get('rate', '')
#         year = item.get('date', '')[:4] if item.get('date') else ''
#
#         # 从详情页获取更多信息
#         info = response.css('#info').get('')
#
#         # 获取导演
#         director = ','.join(response.css('#info .attrs a::text').getall()[:1])
#
#         # 获取演员
#         actors = ','.join(response.css('#info .actor .attrs a::text').getall())
#
#         # 获取类型
#         type_text = ','.join(response.css('#info span[property="v:genre"]::text').getall())
#
#         # 获取制片方
#         producer = response.css('#info span:contains("制片国家/地区:") + ::text').get('').strip()
#
#         # 获取剧情简介
#         synopsis = response.css('#link-report span[property="v:summary"]::text').get('').strip()
#
#         # 获取封面图片
#         cover_url = response.css('#mainpic img::attr(src)').get('')
#         cover_alt = response.css('#mainpic img::attr(alt)').get('')
#
#         # 构建API请求数据
#         # NOTE(review): the 'type' check below tests the detail-page URL,
#         # which presumably does not contain the tag text '电视剧', so it
#         # likely always falls through to '电影' — verify against real URLs.
#         api_data = {
#             'nameCn': title,
#             'nameEn': '',  # 豆瓣API没有提供英文名
#             'type': '电视剧' if '电视剧' in response.url else '电影',
#             'releaseYear': f"{year}-01-01T00:00:00" if year else None,
#             'director': director,
#             'producer': producer,
#             'actors': actors,
#             'keywords': type_text,
#             'synopsis': synopsis,
#             'coverUrl': cover_url,
#             'coverAlt': cover_alt
#         }
#
#         # 发送数据到API
#         # NOTE(review): the assignment below rebinds the `response`
#         # parameter, shadowing the Scrapy response object for the rest
#         # of this method — use a differently named local instead.
#         try:
#             response = requests.post(
#                 'http://192.168.1.213:8080/flower/api/filmWorks/new',
#                 json=api_data,
#                 headers={'Content-Type': 'application/json'}
#             )
#             if response.status_code == 200:
#                 self.logger.info(f"成功保存作品: {title}")
#             else:
#                 self.logger.error(f"保存失败: {title}, 状态码: {response.status_code}")
#         except Exception as e:
#             self.logger.error(f"API请求失败: {str(e)}")
#
#     # 添加设置(可选)
#     custom_settings = {
#         'DOWNLOAD_DELAY': 2,  # 增加延迟,避免被封
#         'CONCURRENT_REQUESTS': 1,  # 限制并发请求
#         'ROBOTSTXT_OBEY': False
#     }