import scrapy
import json
import requests
 
 
class DoubanTVSpider(scrapy.Spider):
    name = 'douban_tv'
    allowed_domains = ['movie.douban.com']
 
    # Note: Scrapy 2.13+ deprecates start_requests() in favour of the async
    # start() method; start_requests() is kept here for compatibility with
    # older Scrapy versions.
    def start_requests(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/112.0.0.0 Safari/537.36',
            'Referer': 'https://movie.douban.com/tv/'
        }
        # Crawl both movies and TV series at once:
        # urls = [
        #     "https://movie.douban.com/j/new_search_subjects?sort=U&tag=电视剧&start=0",
        #     "https://movie.douban.com/j/new_search_subjects?sort=U&tag=电影&start=0"
        # ]
        # Alternative endpoint: currently hot TV series (tag=热门, URL-encoded)
        urls = [
            "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&sort=recommend&start=0"
        ]
        for url in urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)
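
    # Sketch: on Scrapy 2.13+, the async start() method supersedes
    # start_requests(). An equivalent definition would look like this (left
    # commented out so the spider still runs on older Scrapy versions):
    #
    # async def start(self):
    #     for request in self.start_requests():
    #         yield request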
 
    def parse(self, response):
        try:
            data = json.loads(response.text)
            self.logger.info(f"Parsed data: {data}")  # log the payload for debugging
        except json.JSONDecodeError:
            self.logger.error("JSON decoding failed: %s", response.url)
            return

        # The search_subjects endpoint returns results under the 'subjects' key
        items = data.get('subjects', [])
        if not items:
            self.logger.error("No 'subjects' data found in response")
            return
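
        # For reference, the endpoint responds with JSON shaped roughly like
        # this (only the fields this spider reads are shown; the exact schema
        # is Douban's and may change):
        # {"subjects": [{"title": "...", "rate": "9.1",
        #                "url": "https://movie.douban.com/subject/25754848/"}]}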
 
        for item in items:
            detail_url = item.get('url')
            if detail_url:
                yield scrapy.Request(
                    detail_url,
                    callback=self.parse_detail,
                    meta={'item': item}
                )
 
        # Pagination: each page holds 20 items; fetch start=0, 20, 40 and
        # stop before start=60 (three pages in total)
        current_start = int(response.url.split('start=')[-1])
        next_start = current_start + 20
        if next_start < 60:
            next_url = response.url.replace(f'start={current_start}', f'start={next_start}')
            yield response.follow(next_url, callback=self.parse)
 
    def parse_detail(self, response):
        item = response.meta['item']

        # The url field from the listing JSON (format:
        # https://movie.douban.com/subject/25754848/) was used to request
        # this detail page.

        # Basic fields from the listing JSON
        title = item.get('title', '').strip()
        rate = item.get('rate', '')  # rating; not sent to the API below
        # The release year is extracted from the detail page further down.

        # Directors: locate the links after the 导演 label with an
        # adjacent-sibling selector
        directors = response.css('#info span:contains("导演") + .attrs a::text').getall()
        director = ','.join(directors)

        # Cast: same adjacent-sibling pattern on the 主演 label
        actor_list = response.css('#info span:contains("主演") + .attrs a::text').getall()
        actors = ','.join(actor_list)
 
        # Genres
        type_text = ','.join(response.css('#info span[property="v:genre"]::text').getall())

        # Production country/region (sent as 'producer' in the API payload);
        # guard against a missing node before stripping
        producer = (response.xpath('//span[contains(text(), "制片国家/地区")]/following::text()[1]').get() or '').strip()

        # Synopsis: collect both visible and hidden text under v:summary,
        # then collapse all whitespace runs into single spaces
        synopsises = response.xpath('//span[@property="v:summary"]//text()').getall()
        synopsis = ' '.join(''.join(synopsises).strip().split())
 
        # Alternative: first air date
        # release_date = response.css('span[property="v:initialReleaseDate"]::text').get()
        # year = release_date.split('-')[0] if release_date else "未知"

        # Year: pull the digits out of the '.year' element, e.g. '(2023)'
        year = response.css('.year::text').re_first(r'(\d+)')

        # Fallback in case the regex matches nothing ('未知' = unknown)
        if not year:
            year_text = response.css('.year::text').get()
            year = year_text.strip('()') if year_text else "未知"
 
        # Cover image URL and alt text
        cover_url = response.css('#mainpic img::attr(src)').get('')
        cover_alt = response.css('#mainpic img::attr(alt)').get('')
 
        # Build the payload for the backend API
        api_data = {
            'nameCn': title,
            'nameEn': '',  # Douban does not provide an English title here
            'type': 'tv',
            'releaseYear': year,
            'director': director,
            'producer': producer,
            'actors': actors,
            'keywords': type_text,
            'synopsis': synopsis,
            'coverUrl': cover_url,
            'coverAlt': cover_alt,
            'userType': 'official'
        }
 
        # Send the record to the backend API; use a separate name so the
        # Scrapy response passed into this method is not shadowed
        try:
            api_response = requests.post(
                'http://192.168.1.213:8080/flower/api/filmWorks/new',
                json=api_data,
                headers={'Content-Type': 'application/json'}
            )
            if api_response.status_code == 200:
                self.logger.info(f"Saved work: {title}")
            else:
                self.logger.error(f"Save failed: {title}, status code: {api_response.status_code}")
        except Exception as e:
            self.logger.error(f"API request failed: {e}")
 
    # Spider-level settings (optional)
    custom_settings = {
        'DOWNLOAD_DELAY': 2,  # slow down to avoid getting banned
        'CONCURRENT_REQUESTS': 1,  # limit concurrent requests
        'ROBOTSTXT_OBEY': False,
        'LOG_LEVEL': 'DEBUG'  # verbose logging while developing
    }
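

# Minimal way to run this spider without a full Scrapy project (a sketch
# using Scrapy's CrawlerProcess; inside a project you would normally run
# `scrapy crawl douban_tv` instead):
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(DoubanTVSpider)
    process.start()  # blocks until the crawl finishes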