| | |
| | | from werkzeug.exceptions import BadRequest |
| | | from ner_config import NERConfig, RepaymentNERConfig, IncomeNERConfig, FlightNERConfig, TrainNERConfig |
| | | import re |
| | | import threading |
| | | import subprocess |
| | | |
| | | from test2 import process_films |
| | | |
| | | |
| | | # 配置日志 |
| | | logging.basicConfig( |
| | |
| | | # 还款短信特征 |
| | | repayment_keywords = [ |
| | | "还款", "账单", "信用卡", "借款", "贷款", "逾期", "欠款", "最低还款", |
| | | "应还金额", "到期还款", "还清", "应还", "还款日", "账单¥", "账单¥", "查账还款" |
| | | "应还金额", "到期还款", "还清", "应还", "还款日", "账单¥", "账单¥", "查账还款", |
| | | "扣款用于", "房贷还款", "信用卡还款", "车贷还款", "应还款额", "最低还款额" |
| | | ] |
| | | |
| | | # 收入短信特征 |
| | | income_keywords = [ |
| | | "收入", "转账", "入账", "到账", "支付", "工资", "报销", "余额", |
| | | "成功收款", "收到", "款项" |
| | | "成功收款", "收到", "款项", "收入金额" |
| | | ] |
| | | |
| | | # 航班/火车票特征 |
| | |
| | | # 快递关键词 |
| | | "快递", "包裹", "取件码", "取件", "签收", "派送", "配送", |
| | | # 收入关键词 |
| | | "收入", "转账", "入账", "到账", "支付成功", "工资" |
| | | "收入", "转账", "入账", "到账", "支付成功", "工资", "支付宝转账", "微信转账" |
| | | ] |
| | | |
| | | # 先检查是否为收入短信(优先于通知判断) |
| | | income_indicators = ["收入", "入账", "转账", "工资"] |
| | | for indicator in income_indicators: |
| | | if indicator in text and "元" in text: |
| | | # 可能是收入短信,不要判断为通知 |
| | | return False |
| | | |
| | | # 运营商余额通知特征 |
| | | telecom_balance_patterns = [ |
| | |
| | | if word in code: |
| | | code = code[:code.index(word)] |
| | | |
| | | # 只保留字母、数字和连字符 |
| | | code = ''.join(c for c in code if c.isalnum() or c == "-") |
| | | # 针对不同快递公司采取不同策略 |
| | | if "丰巢" in text or "蜂巢" in text: |
| | | # 对于丰巢快递,只保留数字 |
| | | code = ''.join(c for c in code if c.isdigit()) |
| | | elif "中国邮政" in text: |
| | | # 对于中国邮政,尝试提取完整货位号 |
| | | cargo_match = re.search(r'货号[::]\s*([0-9A-Za-z\-]+)', text) |
| | | if cargo_match: |
| | | code = cargo_match.group(1) |
| | | else: |
| | | # 保留原始格式(字母、数字和连字符) |
| | | code = ''.join(c for c in code if c.isalnum() or c == "-") |
| | | else: |
| | | # 对于其他快递,保留字母、数字和连字符 |
| | | code = ''.join(c for c in code if c.isalnum() or c == "-") |
| | | |
| | | # 确保格式正确 |
| | | parts = code.split("-") |
| | | valid_parts = [] |
| | | for part in parts: |
| | | if part and any(c.isalnum() for c in part): |
| | | valid_parts.append(part) |
| | | |
| | | if valid_parts: |
| | | result["pickup_code"] = "-".join(valid_parts) |
| | | if "丰巢" in text or "蜂巢" in text: |
| | | # 对于丰巢快递,只保留数字 |
| | | if code.isdigit(): |
| | | result["pickup_code"] = code |
| | | else: |
| | | # 如果没有数字,尝试再次从文本中匹配纯数字取件码 |
| | | pickup_code_match = re.search(r'码[^0-9]*(\d+)', text) |
| | | if pickup_code_match: |
| | | result["pickup_code"] = pickup_code_match.group(1) |
| | | else: |
| | | result["pickup_code"] = None |
| | | elif "中国邮政" in text: |
| | | # 对于中国邮政,验证格式是否合理 |
| | | if re.match(r'[0-9A-Za-z\-]+', code): |
| | | result["pickup_code"] = code |
| | | else: |
| | | # 二次尝试:从文本中直接获取货位号 |
| | | cargo_match = re.search(r'货号[::]\s*([0-9A-Za-z\-]+)', text) |
| | | if cargo_match: |
| | | result["pickup_code"] = cargo_match.group(1) |
| | | else: |
| | | # 最后尝试:提取"取件密码"后的纯数字序列 |
| | | password_match = re.search(r'密码\s*(\d+)', text) |
| | | if password_match: |
| | | result["pickup_code"] = password_match.group(1) |
| | | else: |
| | | result["pickup_code"] = None |
| | | else: |
| | | result["pickup_code"] = None |
| | | # 对于其他快递,保持原有逻辑 |
| | | parts = code.split("-") |
| | | valid_parts = [] |
| | | for part in parts: |
| | | if part and any(c.isalnum() for c in part): |
| | | valid_parts.append(part) |
| | | |
| | | if valid_parts: |
| | | result["pickup_code"] = "-".join(valid_parts) |
| | | else: |
| | | result["pickup_code"] = None |
| | | |
| | | # 清理公司名称 |
| | | if result["company"]: |
| | |
| | | for word in invalid_words: |
| | | if company.endswith(word): |
| | | company = company[:-len(word)] |
| | | |
| | | # 特殊处理中国邮政 |
| | | if company == "中国" and "中国邮政" in text: |
| | | company = "邮政" |
| | | elif "中国邮政" in text and not company: |
| | | company = "邮政" |
| | | |
| | | result["company"] = company.strip() |
| | | elif "中国邮政" in text: # 如果NER未识别但文本中有中国邮政 |
| | | result["company"] = "邮政" |
| | | |
| | | # 清理地址 |
| | | if result["address"]: |
| | |
| | | |
| | | # 尝试查找最低还款金额 |
| | | min_amount_match = re.search(r'最低还款([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', context) |
| | | if min_amount_match and "MIN_CODE" in current_entity["type"]: |
| | | # 修复:确保current_entity存在且有type属性再使用 |
| | | if min_amount_match and 'current_entity' in locals() and current_entity is not None and "MIN_CODE" in current_entity["type"]: |
| | | return min_amount_match.group(1) # 直接返回匹配到的最低还款金额,保留原始格式 |
| | | |
| | | # 在上下文中查找完整金额 |
| | |
| | | # 保存原始短信内容到文件 |
| | | save_sms_to_file(text) |
| | | |
| | | # 已还清或已结清的短信模式 |
| | | already_paid_patterns = [ |
| | | r"已还清", |
| | | r"已结清", |
| | | r"还款.*?入账后.*?已还清", |
| | | r"还款.*?入账.*?结清" |
| | | ] |
| | | |
| | | # 检查是否为已还清/已结清的短信 |
| | | for pattern in already_paid_patterns: |
| | | if re.search(pattern, text): |
| | | logger.info(f"识别为已还清/已结清短信,归类为其他: {text[:30]}...") |
| | | category = "其他" |
| | | save_sms_to_file(text, category, 1.0) |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": {} |
| | | } |
| | | }) |
| | | |
| | | # 银行收入短信特征识别 |
| | | income_patterns = [ |
| | | r"收入金额[\d,.]+元", |
| | | r"账户.*?收入.*?[\d,.]+元", |
| | | r"账户.*?工资", |
| | | r"工资.*?收入", |
| | | r"入账[\d,.]+元", |
| | | # 添加新模式匹配南京银行等类似格式 |
| | | r"收入\d+\.\d+元", |
| | | r"账号.*?收入\d+\.\d+元", |
| | | r"尾号\d+的账号.*?收入\d+\.\d+元", |
| | | r"支付宝转账", |
| | | r"转账.*?收入" |
| | | ] |
| | | |
| | | # 银行还款短信特征识别 |
| | | repayment_patterns = [ |
| | | r"信用卡.*?还款", |
| | | r"账单.*?[\d,.]+元", |
| | | r"应还款额.*?[\d,.]+元", |
| | | r"最低还款额.*?[\d,.]+元", |
| | | r"到期还款日", |
| | | r"扣款.*?用于.*?还款", |
| | | r"扣款.*?用于.*?贷款", |
| | | r"扣款.*?用于.*?信用卡", |
| | | r"车贷还款", |
| | | r"房贷还款", |
| | | r"贷款还款", |
| | | r"信用卡账单", |
| | | r"下次还款日" |
| | | ] |
| | | |
| | | # 检查是否为收入短信 |
| | | for pattern in income_patterns: |
| | | if re.search(pattern, text): |
| | | logger.info(f"识别为收入短信: {text[:30]}...") |
| | | category = "收入" |
| | | details = model_manager.extract_income_entities(text) |
| | | save_sms_to_file(text, category, 1.0) |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": details |
| | | } |
| | | }) |
| | | |
| | | # 检查是否为还款短信 |
| | | for pattern in repayment_patterns: |
| | | if re.search(pattern, text): |
| | | # 二次检查:如果包含"已还清"或"已结清"等词,归类为"其他" |
| | | if any(re.search(paid_pattern, text) for paid_pattern in already_paid_patterns): |
| | | logger.info(f"虽然识别为还款短信,但包含已还清/已结清,归类为其他: {text[:30]}...") |
| | | category = "其他" |
| | | save_sms_to_file(text, category, 1.0) |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": {} |
| | | } |
| | | }) |
| | | |
| | | logger.info(f"识别为还款短信: {text[:30]}...") |
| | | category = "还款" |
| | | details = model_manager.extract_repayment_entities(text) |
| | | save_sms_to_file(text, category, 1.0) |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": details |
| | | } |
| | | }) |
| | | |
| | | # 特定短信识别逻辑 - 针对百度通知和招商银行账单 |
| | | # 识别百度通知 |
| | | if "百度智能云" in text and "尊敬的用户" in text and "免费额度" in text: |
| | |
| | | |
| | | # 识别招商银行账单 |
| | | if "招商银行" in text and ("账单¥" in text or "账单¥" in text or "还款日" in text): |
| | | # 检查是否为已还清/已结清短信 |
| | | if any(re.search(paid_pattern, text) for paid_pattern in already_paid_patterns): |
| | | logger.info(f"招商银行短信包含已还清/已结清,归类为其他: {text[:30]}...") |
| | | category = "其他" |
| | | save_sms_to_file(text, category, 1.0) |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": {} |
| | | } |
| | | }) |
| | | |
| | | logger.info(f"直接识别为招商银行还款短信: {text[:30]}...") |
| | | category = "还款" |
| | | details = model_manager.extract_repayment_entities(text) |
| | |
| | | "message": "服务器内部错误" |
| | | }), 500 |
| | | |
| | | |
def run_douban_spider(type_, tag):
    """Run the ``douban.py`` crawler script as a child process and log failures.

    The child's stdout/stderr are captured rather than inherited, so a failed
    crawl would leave no trace unless it is reported here.

    Args:
        type_: Forwarded as the first command-line argument of ``douban.py``.
        tag: Forwarded as the second command-line argument of ``douban.py``.

    Returns:
        None.
    """
    # subprocess only accepts string arguments; coerce so that a non-string
    # value (e.g. a JSON number) does not raise TypeError in the caller's
    # worker thread.
    command = ["python", "douban.py", str(type_), str(tag)]
    try:
        completed = subprocess.run(command, capture_output=True, text=True)
    except OSError as exc:
        # Raised when the interpreter itself cannot be launched
        # (for example, "python" is not on PATH).
        logger.error(f"启动豆瓣爬虫失败: {exc}")
        return

    # Surface a failing crawl instead of silently discarding the result.
    if completed.returncode != 0:
        logger.error(
            f"豆瓣爬虫异常退出 (returncode={completed.returncode}): "
            f"{completed.stderr.strip()}"
        )
| | | |
def run_process_films():
    """Run ``process_films`` and log any failure instead of propagating it.

    Any ``Exception`` raised by ``process_films`` is caught and logged here,
    so it does not reach the caller.
    """
    try:
        process_films()
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which a plain error message would lose.
        logger.exception(f"处理电影内容失败: {str(e)}")
| | | |
@app.route("/crawl-douban", methods=["POST"])
def crawl_douban():
    """Start the Douban crawler in a background thread.

    Accepts an optional JSON object with ``type`` and ``tag`` keys. A missing,
    malformed or non-object body falls back to the defaults
    (``type="tv"``, ``tag="热门"``), as does any individual value that is not
    a string.

    Returns:
        A JSON payload confirming the task was started, including the
        effective ``type`` and ``tag``. On an unexpected error, a JSON error
        payload with HTTP status 500.
    """
    try:
        default_params = {"type": "tv", "tag": "热门"}

        # silent=True makes get_json return None for a malformed body instead
        # of raising BadRequest, which the broad handler below would report
        # as a 500 rather than falling back to the defaults.
        params = request.get_json(silent=True) if request.is_json else None
        if not isinstance(params, dict):  # also covers None / lists / scalars
            params = default_params

        # Read the parameters, defaulting any that are absent.
        type_ = params.get("type", default_params["type"])
        tag = params.get("tag", default_params["tag"])

        # Both values end up as command-line arguments of the crawler, so
        # anything that is not a string is replaced by its default.
        if not isinstance(type_, str):
            type_ = default_params["type"]
        if not isinstance(tag, str):
            tag = default_params["tag"]

        # Record the effective parameters through the file's logger.
        logger.info(f"Received params - type: {type_}, tag: {tag}")

        # Run the crawler asynchronously so the request returns immediately.
        t = threading.Thread(target=run_douban_spider, args=(type_, tag))
        t.start()

        return jsonify({"status": "success", "message": "爬虫任务已启动", "type": type_, "tag": tag})
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
| | | |
@app.route('/generate-film-content', methods=['POST'])
def generate_film_content_api():
    """Kick off film content generation in a background thread.

    Returns:
        An "accepted" JSON payload once the worker thread has been started,
        or an error JSON payload with HTTP status 500 if anything raised.
    """
    try:
        # Hand the work to a separate thread so the request returns at once.
        worker = threading.Thread(target=run_process_films)
        worker.start()
        accepted = {"status": "accepted", "message": "任务已异步启动"}
        return jsonify(accepted)
    except Exception as exc:
        failure = {"status": "error", "message": str(exc)}
        return jsonify(failure), 500
if __name__ == "__main__":
    # Executed as a script: serve the app on port 5000, bound to all
    # network interfaces (0.0.0.0).
    app.run(port=5000, host="0.0.0.0")