import scrapy
import json
import requests
from datetime import datetime
from urllib.parse import urljoin


class DoubanTVSpider(scrapy.Spider):
    name = 'douban_tv'
    allowed_domains = ['movie.douban.com']

    # Entry point for seeding requests. (Recent Scrapy releases offer an async
    # start() to replace the deprecated start_requests(), but start_requests()
    # still works here.)
    def start_requests(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/112.0.0.0 Safari/537.36',
            'Referer': 'https://movie.douban.com/tv/'
        }
        # Crawl both movies and TV series:
        # urls = [
        #     "https://movie.douban.com/j/new_search_subjects?sort=U&tag=电视剧&start=0",
        #     "https://movie.douban.com/j/new_search_subjects?sort=U&tag=电影&start=0"
        # ]
        # Path 2: latest hot TV series
        urls = [
            "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&sort=recommend&start=0"
        ]
        for url in urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        try:
            data = json.loads(response.text)
            self.logger.info(f"Parsed payload: {data}")  # log the raw payload
        except json.JSONDecodeError:
            self.logger.error("JSON decode failed: %s", response.url)
            return

        # Check the payload structure; this API returns a 'subjects' array
        items = data.get('subjects', [])
        if not items:
            self.logger.error("No 'subjects' data found")
            return

        for item in items:
            detail_url = item.get('url')
            if detail_url:
                yield scrapy.Request(
                    detail_url,
                    callback=self.parse_detail,
                    meta={'item': item}
                )

        # Pagination: step 'start' by 20, crawling at most the first three
        # pages (start = 0, 20, 40)
        current_start = int(response.url.split('start=')[-1])
        next_start = current_start + 20
        if next_start < 60:
            next_url = response.url.replace(f'start={current_start}', f'start={next_start}')
            yield response.follow(next_url, callback=self.parse)

    def parse_detail(self, response):
        # The 'url' field from the listing JSON (format:
        # https://movie.douban.com/subject/25754848/) was used for this request.
        item = response.meta['item']

        # Basic fields from the listing JSON
        title = item.get('title', '').strip()
        rate = item.get('rate', '')
        # Initial year guess from the JSON (refined below from the detail page)
        year = item.get('date', '')[:4] if item.get('date') else ''

        # Raw HTML of the #info block; individual fields are parsed below
        info = response.css('#info').get('')

        # Director: adjacent-sibling selector against the #info block
        directors = response.css('#info span:contains("导演") + .attrs a::text').getall()
        director = ','.join(directors)

        # Cast: same adjacent-sibling pattern
        actor = response.css('#info span:contains("主演") + .attrs a::text').getall()
        actors = ','.join(actor)

        # Genres
        type_text = ','.join(response.css('#info span[property="v:genre"]::text').getall())

        # Production country/region; guard against a missing node before stripping
        producer = (response.xpath(
            '//span[contains(text(), "制片国家/地区")]/following::text()[1]'
        ).get() or '').strip()

        # Synopsis: extract both visible and hidden text, then collapse whitespace
        synopsis_parts = response.xpath('//span[@property="v:summary"]//text()').getall()
        synopsis = ' '.join(''.join(synopsis_parts).strip().split())

        # Release year from the detail page
        # release_date = response.css('span[property="v:initialReleaseDate"]::text').get()
        # year = release_date.split('-')[0] if release_date else "未知"
        # Preferred approach: match the digits inside the '.year' element
        year = response.css('.year::text').re_first(r'(\d+)')
        # Fallback if the regex finds nothing
        if not year:
            year_text = response.css('.year::text').get()
            year = year_text.strip('()') if year_text else "未知"

        # Cover image
        cover_url = response.css('#mainpic img::attr(src)').get('')
        cover_alt = response.css('#mainpic img::attr(alt)').get('')

        # Build the API payload
        api_data = {
            'nameCn': title,
            'nameEn': '',  # Douban does not provide an English title
            'type': 'tv',
            'releaseYear': year,
            'director': director,
            'producer': producer,
            'actors': actors,
            'keywords': type_text,
            'synopsis': synopsis,
            'coverUrl': cover_url,
            'coverAlt': cover_alt,
            'userType': 'official'
        }

        # Post the record to the backend API (note: requests.post is a blocking
        # call; api_response is named to avoid shadowing the Scrapy response)
        try:
            api_response = requests.post(
                'http://192.168.1.213:8080/flower/api/filmWorks/new',
                json=api_data,
                headers={'Content-Type': 'application/json'}
            )
            if api_response.status_code == 200:
                self.logger.info(f"Saved work: {title}")
            else:
                self.logger.error(f"Save failed: {title}, status code: {api_response.status_code}")
        except Exception as e:
            self.logger.error(f"API request failed: {str(e)}")

    # Optional settings: throttle the crawl and surface debug output
    custom_settings = {
        'DOWNLOAD_DELAY': 2,       # add delay between requests to avoid being banned
        'CONCURRENT_REQUESTS': 1,  # limit concurrent requests
        'ROBOTSTXT_OBEY': False,
        'LOG_LEVEL': 'DEBUG'       # DEBUG level to see more detail
    }