# import scrapy
# import json
# import requests
# from datetime import datetime
# from urllib.parse import urljoin
#
#
# class DoubanTVSpider(scrapy.Spider):
#     name = 'douban_tv'
#     allowed_domains = ['movie.douban.com']
#
#     # 使用start()替代已弃用的start_requests()
#     def start_requests(self):
#         headers = {
#             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/112.0.0.0 Safari/537.36'
#         }
#         # 同时爬取电影和电视剧
#         urls = [
#             "https://movie.douban.com/j/new_search_subjects?sort=U&tag=电视剧&start=0",
#             "https://movie.douban.com/j/new_search_subjects?sort=U&tag=电影&start=0"
#         ]
#
#         for url in urls:
#             yield scrapy.Request(url, headers=headers, callback=self.parse)
#
#     def parse(self, response):
#         try:
#             data = json.loads(response.text)
#         except json.JSONDecodeError:
#             self.logger.error("JSON解析失败: %s", response.url)
#             return
#
#         for item in data.get('data', []):
#             detail_url = item.get('url')
#             if detail_url:
#                 yield scrapy.Request(
#                     detail_url,
#                     callback=self.parse_detail,
#                     meta={'item': item}
#                 )
#
#         # 分页逻辑
#         current_start = int(response.url.split('start=')[-1])
#         next_start = current_start + 20
#         if next_start < 60:
#             next_url = response.url.replace(f'start={current_start}', f'start={next_start}')
#             yield response.follow(next_url, callback=self.parse)
#
#     def parse_detail(self, response):
#         item = response.meta['item']
#
#         # 获取详细信息
#         title = item.get('title', '').strip()
#         rate = item.get('rate', '')
#         year = item.get('date', '')[:4] if item.get('date') else ''
#
#         # 从详情页获取更多信息
#         info = response.css('#info').get('')
#
#         # 获取导演
#         director = ','.join(response.css('#info .attrs a::text').getall()[:1])
#
#         # 获取演员
#         actors = ','.join(response.css('#info .actor .attrs a::text').getall())
#
#         # 获取类型
#         type_text = ','.join(response.css('#info span[property="v:genre"]::text').getall())
#
#         # 获取制片方
#         producer = response.css('#info span:contains("制片国家/地区:") + ::text').get('').strip()
#
#         # 获取剧情简介
#         synopsis = response.css('#link-report span[property="v:summary"]::text').get('').strip()
#
#         # 获取封面图片
#         cover_url = response.css('#mainpic img::attr(src)').get('')
#         cover_alt = response.css('#mainpic img::attr(alt)').get('')
#
#         # 构建API请求数据
#         api_data = {
#             'nameCn': title,
#             'nameEn': '',  # 豆瓣API没有提供英文名
#             'type': '电视剧' if '电视剧' in response.url else '电影',
#             'releaseYear': f"{year}-01-01T00:00:00" if year else None,
#             'director': director,
#             'producer': producer,
#             'actors': actors,
#             'keywords': type_text,
#             'synopsis': synopsis,
#             'coverUrl': cover_url,
#             'coverAlt': cover_alt
#         }
#
#         # 发送数据到API
#         try:
#             response = requests.post(
#                 'http://192.168.1.213:8080/flower/api/filmWorks/new',
#                 json=api_data,
#                 headers={'Content-Type': 'application/json'}
#             )
#             if response.status_code == 200:
#                 self.logger.info(f"成功保存作品: {title}")
#             else:
#                 self.logger.error(f"保存失败: {title}, 状态码: {response.status_code}")
#         except Exception as e:
#             self.logger.error(f"API请求失败: {str(e)}")
#
#     # 添加设置(可选)
#     custom_settings = {
#         'DOWNLOAD_DELAY': 2,  # 增加延迟,避免被封
#         'CONCURRENT_REQUESTS': 1,  # 限制并发请求
#         'ROBOTSTXT_OBEY': False
#     }