From 90e496298b21f0594b85535ec47d26d2d7d5a9ed Mon Sep 17 00:00:00 2001 From: cloudroam <cloudroam> Date: 星期四, 03 七月 2025 17:33:09 +0800 Subject: [PATCH] fix:生成的内容 --- app.py | 200 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 files changed, 185 insertions(+), 15 deletions(-) diff --git a/app.py b/app.py index 9a8237a..872cff9 100644 --- a/app.py +++ b/app.py @@ -203,13 +203,14 @@ # 还款短信特征 repayment_keywords = [ "还款", "账单", "信用卡", "借款", "贷款", "逾期", "欠款", "最低还款", - "应还金额", "到期还款", "还清", "应还", "还款日", "账单¥", "账单¥", "查账还款" + "应还金额", "到期还款", "还清", "应还", "还款日", "账单¥", "账单¥", "查账还款", + "扣款用于", "房贷还款", "信用卡还款", "车贷还款", "应还款额", "最低还款额" ] # 收入短信特征 income_keywords = [ "收入", "转账", "入账", "到账", "支付", "工资", "报销", "余额", - "成功收款", "收到", "款项" + "成功收款", "收到", "款项", "收入金额" ] # 航班/火车票特征 @@ -342,8 +343,15 @@ # 快递关键词 "快递", "包裹", "取件码", "取件", "签收", "派送", "配送", # 收入关键词 - "收入", "转账", "入账", "到账", "支付成功", "工资" + "收入", "转账", "入账", "到账", "支付成功", "工资", "支付宝转账", "微信转账" ] + + # 先检查是否为收入短信(优先于通知判断) + income_indicators = ["收入", "入账", "转账", "工资"] + for indicator in income_indicators: + if indicator in text and "元" in text: + # 可能是收入短信,不要判断为通知 + return False # 运营商余额通知特征 telecom_balance_patterns = [ @@ -458,20 +466,62 @@ if word in code: code = code[:code.index(word)] - # 只保留字母、数字和连字符 - code = ''.join(c for c in code if c.isalnum() or c == "-") + # 针对不同快递公司采取不同策略 + if "丰巢" in text or "蜂巢" in text: + # 对于丰巢快递,只保留数字 + code = ''.join(c for c in code if c.isdigit()) + elif "中国邮政" in text: + # 对于中国邮政,尝试提取完整货位号 + cargo_match = re.search(r'货号[::]\s*([0-9A-Za-z\-]+)', text) + if cargo_match: + code = cargo_match.group(1) + else: + # 保留原始格式(字母、数字和连字符) + code = ''.join(c for c in code if c.isalnum() or c == "-") + else: + # 对于其他快递,保留字母、数字和连字符 + code = ''.join(c for c in code if c.isalnum() or c == "-") # 确保格式正确 - parts = code.split("-") - valid_parts = [] - for part in parts: - if part and any(c.isalnum() for c in part): - valid_parts.append(part) - - if valid_parts: - result["pickup_code"] = "-".join(valid_parts) + if "丰巢" in text or "蜂巢" in text: + # 对于丰巢快递,只保留数字 + if code.isdigit(): + result["pickup_code"] = code + else: + # 如果没有数字,尝试再次从文本中匹配纯数字取件码 + pickup_code_match = re.search(r'码[^0-9]*(\d+)', text) + if pickup_code_match: + result["pickup_code"] = pickup_code_match.group(1) + else: + result["pickup_code"] = None + elif "中国邮政" in text: + # 对于中国邮政,验证格式是否合理 + if re.match(r'[0-9A-Za-z\-]+', code): + result["pickup_code"] = code + else: + # 二次尝试:从文本中直接获取货位号 + cargo_match = re.search(r'货号[::]\s*([0-9A-Za-z\-]+)', text) + if cargo_match: + result["pickup_code"] = cargo_match.group(1) + else: + # 最后尝试:提取"取件密码"后的纯数字序列 + password_match = re.search(r'密码\s*(\d+)', text) + if password_match: + result["pickup_code"] = password_match.group(1) + else: + result["pickup_code"] = None else: - result["pickup_code"] = None + # 对于其他快递,保持原有逻辑 + parts = code.split("-") + valid_parts = [] + for part in parts: + if part and any(c.isalnum() for c in part): + valid_parts.append(part) + + if valid_parts: + result["pickup_code"] = "-".join(valid_parts) + else: + result["pickup_code"] = None # 清理公司名称 if result["company"]: @@ -480,7 +530,16 @@ for word in invalid_words: if company.endswith(word): company = company[:-len(word)] + + # 特殊处理中国邮政 + if company == "中国" and "中国邮政" in text: + company = "邮政" + elif "中国邮政" in text and not company: + company = "邮政" + result["company"] = company.strip() + elif "中国邮政" in text: # 如果NER未识别但文本中有中国邮政 + result["company"] = "邮政" # 清理地址 if result["address"]: @@ -623,7 +682,8 @@ # 尝试查找最低还款金额 min_amount_match = re.search(r'最低还款([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', context) - if min_amount_match and "MIN_CODE" in current_entity["type"]: + # 修复:确保current_entity存在且有type属性再使用 + if min_amount_match and 'current_entity' in locals() and current_entity is not None and "MIN_CODE" in current_entity["type"]: return min_amount_match.group(1) # 直接返回匹配到的最低还款金额,保留原始格式 # 在上下文中查找完整金额 @@ -1360,6 +1420,103 @@ # 保存原始短信内容到文件 save_sms_to_file(text) + # 已还清或已结清的短信模式 + already_paid_patterns = [ + r"已还清", + r"已结清", + r"还款.*?入账后.*?已还清", + r"还款.*?入账.*?结清" + ] + + # 检查是否为已还清/已结清的短信 + for pattern in already_paid_patterns: + if re.search(pattern, text): + logger.info(f"识别为已还清/已结清短信,归类为其他: {text[:30]}...") + category = "其他" + save_sms_to_file(text, category, 1.0) + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": {} + } + }) + + # 银行收入短信特征识别 + income_patterns = [ + r"收入金额[\d,.]+元", + r"账户.*?收入.*?[\d,.]+元", + r"账户.*?工资", + r"工资.*?收入", + r"入账[\d,.]+元", + # 添加新模式匹配南京银行等类似格式 + r"收入\d+\.\d+元", + r"账号.*?收入\d+\.\d+元", + r"尾号\d+的账号.*?收入\d+\.\d+元", + r"支付宝转账", + r"转账.*?收入" + ] + + # 银行还款短信特征识别 + repayment_patterns = [ + r"信用卡.*?还款", + r"账单.*?[\d,.]+元", + r"应还款额.*?[\d,.]+元", + r"最低还款额.*?[\d,.]+元", + r"到期还款日", + r"扣款.*?用于.*?还款", + r"扣款.*?用于.*?贷款", + r"扣款.*?用于.*?信用卡", + r"车贷还款", + r"房贷还款", + r"贷款还款", + r"信用卡账单", + r"下次还款日" + ] + + # 检查是否为收入短信 + for pattern in income_patterns: + if re.search(pattern, text): + logger.info(f"识别为收入短信: {text[:30]}...") + category = "收入" + details = model_manager.extract_income_entities(text) + save_sms_to_file(text, category, 1.0) + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": details + } + }) + + # 检查是否为还款短信 + for pattern in repayment_patterns: + if re.search(pattern, text): + # 二次检查:如果包含"已还清"或"已结清"等词,归类为"其他" + if any(re.search(paid_pattern, text) for paid_pattern in already_paid_patterns): + logger.info(f"虽然识别为还款短信,但包含已还清/已结清,归类为其他: {text[:30]}...") + category = "其他" + save_sms_to_file(text, category, 1.0) + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": {} + } + }) + + logger.info(f"识别为还款短信: {text[:30]}...") + category = "还款" + details = model_manager.extract_repayment_entities(text) + save_sms_to_file(text, category, 1.0) + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": details + } + }) + # 特定短信识别逻辑 - 针对百度通知和招商银行账单 # 识别百度通知 if "百度智能云" in text and "尊敬的用户" in text and "免费额度" in text: @@ -1376,6 +1533,19 @@ # 识别招商银行账单 if "招商银行" in text and ("账单¥" in text or "账单¥" in text or "还款日" in text): + # 检查是否为已还清/已结清短信 + if any(re.search(paid_pattern, text) for paid_pattern in already_paid_patterns): + logger.info(f"招商银行短信包含已还清/已结清,归类为其他: {text[:30]}...") + category = "其他" + save_sms_to_file(text, category, 1.0) + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": {} + } + }) + logger.info(f"直接识别为招商银行还款短信: {text[:30]}...") category = "还款" details = model_manager.extract_repayment_entities(text) -- Gitblit v1.9.3