From 90e496298b21f0594b85535ec47d26d2d7d5a9ed Mon Sep 17 00:00:00 2001
From: cloudroam <cloudroam>
Date: 星期四, 03 七月 2025 17:33:09 +0800
Subject: [PATCH] fix:生成的内容

---
 app.py |  200 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 185 insertions(+), 15 deletions(-)

diff --git a/app.py b/app.py
index 9a8237a..872cff9 100644
--- a/app.py
+++ b/app.py
@@ -203,13 +203,14 @@
         # 还款短信特征
         repayment_keywords = [
             "还款", "账单", "信用卡", "借款", "贷款", "逾期", "欠款", "最低还款",
-            "应还金额", "到期还款", "还清", "应还", "还款日", "账单¥", "账单¥", "查账还款"
+            "应还金额", "到期还款", "还清", "应还", "还款日", "账单¥", "账单¥", "查账还款",
+            "扣款用于", "房贷还款", "信用卡还款", "车贷还款", "应还款额", "最低还款额"
         ]
         
         # 收入短信特征
         income_keywords = [
             "收入", "转账", "入账", "到账", "支付", "工资", "报销", "余额", 
-            "成功收款", "收到", "款项"
+            "成功收款", "收到", "款项", "收入金额"
         ]
         
         # 航班/火车票特征
@@ -342,8 +343,15 @@
             # 快递关键词
             "快递", "包裹", "取件码", "取件", "签收", "派送", "配送",
             # 收入关键词
-            "收入", "转账", "入账", "到账", "支付成功", "工资"
+            "收入", "转账", "入账", "到账", "支付成功", "工资", "支付宝转账", "微信转账"
         ]
+        
+        # 先检查是否为收入短信(优先于通知判断)
+        income_indicators = ["收入", "入账", "转账", "工资"]
+        for indicator in income_indicators:
+            if indicator in text and "元" in text:
+                # 可能是收入短信,不要判断为通知
+                return False
         
         # 运营商余额通知特征
         telecom_balance_patterns = [
@@ -458,20 +466,62 @@
                     if word in code:
                         code = code[:code.index(word)]
                 
-                # 只保留字母、数字和连字符
-                code = ''.join(c for c in code if c.isalnum() or c == "-")
+                # 针对不同快递公司采取不同策略
+                if "丰巢" in text or "蜂巢" in text:
+                    # 对于丰巢快递,只保留数字
+                    code = ''.join(c for c in code if c.isdigit())
+                elif "中国邮政" in text:
+                    # 对于中国邮政,尝试提取完整货位号
+                    cargo_match = re.search(r'货号[::]\s*([0-9A-Za-z\-]+)', text)
+                    if cargo_match:
+                        code = cargo_match.group(1)
+                    else:
+                        # 保留原始格式(字母、数字和连字符)
+                        code = ''.join(c for c in code if c.isalnum() or c == "-")
+                else:
+                    # 对于其他快递,保留字母、数字和连字符
+                    code = ''.join(c for c in code if c.isalnum() or c == "-")
                 
                 # 确保格式正确
-                parts = code.split("-")
-                valid_parts = []
-                for part in parts:
-                    if part and any(c.isalnum() for c in part):
-                        valid_parts.append(part)
-                
-                if valid_parts:
-                    result["pickup_code"] = "-".join(valid_parts)
+                if "丰巢" in text or "蜂巢" in text:
+                    # 对于丰巢快递,只保留数字
+                    if code.isdigit():
+                        result["pickup_code"] = code
+                    else:
+                        # 如果没有数字,尝试再次从文本中匹配纯数字取件码
+                        pickup_code_match = re.search(r'码[^0-9]*(\d+)', text)
+                        if pickup_code_match:
+                            result["pickup_code"] = pickup_code_match.group(1)
+                        else:
+                            result["pickup_code"] = None
+                elif "中国邮政" in text:
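+                    # China Post: sanity-check the code format. NOTE(review): re.match anchors only at the start, so any code that merely begins with [0-9A-Za-z-] passes; re.fullmatch would validate the whole string.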
+                    if re.match(r'[0-9A-Za-z\-]+', code):
+                        result["pickup_code"] = code
+                    else:
+                        # 二次尝试:从文本中直接获取货位号
+                        cargo_match = re.search(r'货号[::]\s*([0-9A-Za-z\-]+)', text)
+                        if cargo_match:
+                            result["pickup_code"] = cargo_match.group(1)
+                        else:
+                            # 最后尝试:提取"取件密码"后的纯数字序列
+                            password_match = re.search(r'密码\s*(\d+)', text)
+                            if password_match:
+                                result["pickup_code"] = password_match.group(1)
+                            else:
+                                result["pickup_code"] = None
                 else:
-                    result["pickup_code"] = None
+                    # 对于其他快递,保持原有逻辑
+                    parts = code.split("-")
+                    valid_parts = []
+                    for part in parts:
+                        if part and any(c.isalnum() for c in part):
+                            valid_parts.append(part)
+                    
+                    if valid_parts:
+                        result["pickup_code"] = "-".join(valid_parts)
+                    else:
+                        result["pickup_code"] = None
 
             # 清理公司名称
             if result["company"]:
@@ -480,7 +530,16 @@
                 for word in invalid_words:
                     if company.endswith(word):
                         company = company[:-len(word)]
+                
+                # 特殊处理中国邮政
+                if company == "中国" and "中国邮政" in text:
+                    company = "邮政"
+                elif "中国邮政" in text and not company:
+                    company = "邮政"
+                
                 result["company"] = company.strip()
+            elif "中国邮政" in text:  # 如果NER未识别但文本中有中国邮政
+                result["company"] = "邮政"
 
             # 清理地址
             if result["address"]:
@@ -623,7 +682,8 @@
                 
                 # 尝试查找最低还款金额
                 min_amount_match = re.search(r'最低还款([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', context)
-                if min_amount_match and "MIN_CODE" in current_entity["type"]:
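+                # Fix: only read current_entity["type"] when current_entity is defined in this scope and is not None (the presence of the "type" key itself is not checked).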
+                if min_amount_match and 'current_entity' in locals() and current_entity is not None and "MIN_CODE" in current_entity["type"]:
                     return min_amount_match.group(1)  # 直接返回匹配到的最低还款金额,保留原始格式
                     
                 # 在上下文中查找完整金额
@@ -1360,6 +1420,103 @@
         # 保存原始短信内容到文件
         save_sms_to_file(text)
             
+        # 已还清或已结清的短信模式
+        already_paid_patterns = [
+            r"已还清",
+            r"已结清",
+            r"还款.*?入账后.*?已还清",
+            r"还款.*?入账.*?结清"
+        ]
+        
+        # 检查是否为已还清/已结清的短信
+        for pattern in already_paid_patterns:
+            if re.search(pattern, text):
+                logger.info(f"识别为已还清/已结清短信,归类为其他: {text[:30]}...")
+                category = "其他"
+                save_sms_to_file(text, category, 1.0)
+                return jsonify({
+                    "status": "success",
+                    "data": {
+                        "category": category,
+                        "details": {}
+                    }
+                })
+            
+        # 银行收入短信特征识别
+        income_patterns = [
+            r"收入金额[\d,.]+元",
+            r"账户.*?收入.*?[\d,.]+元",
+            r"账户.*?工资",
+            r"工资.*?收入",
+            r"入账[\d,.]+元",
+            # 添加新模式匹配南京银行等类似格式
+            r"收入\d+\.\d+元",
+            r"账号.*?收入\d+\.\d+元",
+            r"尾号\d+的账号.*?收入\d+\.\d+元",
+            r"支付宝转账",
+            r"转账.*?收入"
+        ]
+        
+        # 银行还款短信特征识别
+        repayment_patterns = [
+            r"信用卡.*?还款",
+            r"账单.*?[\d,.]+元",
+            r"应还款额.*?[\d,.]+元",
+            r"最低还款额.*?[\d,.]+元",
+            r"到期还款日",
+            r"扣款.*?用于.*?还款",
+            r"扣款.*?用于.*?贷款",
+            r"扣款.*?用于.*?信用卡",
+            r"车贷还款",
+            r"房贷还款",
+            r"贷款还款",
+            r"信用卡账单",
+            r"下次还款日"
+        ]
+        
+        # 检查是否为收入短信
+        for pattern in income_patterns:
+            if re.search(pattern, text):
+                logger.info(f"识别为收入短信: {text[:30]}...")
+                category = "收入"
+                details = model_manager.extract_income_entities(text)
+                save_sms_to_file(text, category, 1.0)
+                return jsonify({
+                    "status": "success",
+                    "data": {
+                        "category": category,
+                        "details": details
+                    }
+                })
+        
+        # 检查是否为还款短信
+        for pattern in repayment_patterns:
+            if re.search(pattern, text):
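+                # Secondary check: texts containing "已还清"/"已结清" are categorised as "其他". NOTE(review): effectively unreachable, because any text matching already_paid_patterns has already returned in the loop above.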
+                if any(re.search(paid_pattern, text) for paid_pattern in already_paid_patterns):
+                    logger.info(f"虽然识别为还款短信,但包含已还清/已结清,归类为其他: {text[:30]}...")
+                    category = "其他"
+                    save_sms_to_file(text, category, 1.0)
+                    return jsonify({
+                        "status": "success",
+                        "data": {
+                            "category": category,
+                            "details": {}
+                        }
+                    })
+                
+                logger.info(f"识别为还款短信: {text[:30]}...")
+                category = "还款"
+                details = model_manager.extract_repayment_entities(text)
+                save_sms_to_file(text, category, 1.0)
+                return jsonify({
+                    "status": "success",
+                    "data": {
+                        "category": category,
+                        "details": details
+                    }
+                })
+        
         # 特定短信识别逻辑 - 针对百度通知和招商银行账单
         # 识别百度通知
         if "百度智能云" in text and "尊敬的用户" in text and "免费额度" in text:
@@ -1376,6 +1533,19 @@
         
         # 识别招商银行账单
         if "招商银行" in text and ("账单¥" in text or "账单¥" in text or "还款日" in text):
+            # 检查是否为已还清/已结清短信
+            if any(re.search(paid_pattern, text) for paid_pattern in already_paid_patterns):
+                logger.info(f"招商银行短信包含已还清/已结清,归类为其他: {text[:30]}...")
+                category = "其他"
+                save_sms_to_file(text, category, 1.0)
+                return jsonify({
+                    "status": "success",
+                    "data": {
+                        "category": category,
+                        "details": {}
+                    }
+                })
+                
             logger.info(f"直接识别为招商银行还款短信: {text[:30]}...")
             category = "还款"
             details = model_manager.extract_repayment_entities(text)

--
Gitblit v1.9.3