cloudroam
7 天以前 90e496298b21f0594b85535ec47d26d2d7d5a9ed
app.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import os
import logging
import datetime
from typing import Dict, Optional, Tuple
from flask import Flask, request, jsonify
@@ -124,8 +125,8 @@
            logger.error(f"加载火车票模型失败: {str(e)}")
            raise
    def classify_sms(self, text: str) -> str:
        """对短信进行分类"""
    def classify_sms(self, text: str) -> Tuple[str, float]:
        """对短信进行分类,并返回置信度"""
        try:
            inputs = self.classifier_tokenizer(
                text, 
@@ -135,11 +136,251 @@
            )
            with torch.no_grad():
                outputs = self.classifier_model(**inputs)
            pred_id = outputs.logits.argmax().item()
            return self.classifier_model.config.id2label[pred_id]
            # 获取预测标签及其对应的概率
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            pred_id = logits.argmax().item()
            confidence = probabilities[0, pred_id].item()  # 获取预测标签的置信度
            return self.classifier_model.config.id2label[pred_id], confidence
        except Exception as e:
            logger.error(f"短信分类失败: {str(e)}")
            raise
    def is_marketing_sms(self, text: str) -> bool:
        """判断是否为营销/广告类短信,采用评分系统"""
        # 特定字符串模式检查:直接匹配明显的营销/通知短信
        marketing_patterns = [
            # 百度类通知
            r"百度智能云.*?尊敬的用户",
            r"百度.*?账户.*?tokens",
            r"AppBuilder.*?账户",
            r"账户有.*?免费额度",
            r".*?免费额度.*?过期",
            r"dwz\.cn\/[A-Za-z0-9]+"
        ]
        # 对特定模式直接判断
        for pattern in marketing_patterns:
            if re.search(pattern, text):
                return True  # 直接认为是营销短信
        # 评分系统:根据短信内容特征进行评分,超过阈值判定为营销短信
        score = 0
        # 强营销特征关键词(高权重)
        strong_marketing_keywords = [
            "有奖", "免费赠送", "抽奖", "中奖", "优惠券", "折扣券", "特价", "秒杀",
            "限时抢购", "促销", "推广", "广告", "代金券", "0元购", "tokens调用量"
        ]
        # 一般营销特征关键词(中等权重)
        general_marketing_keywords = [
            "活动", "优惠", "折扣", "限时", "抢购", "特价", "promotion", "推广",
            "开业", "集点", "集赞", "关注", "公众号", "小程序", "注册有礼", "免费额度"
        ]
        # 弱营销特征关键词(低权重,可能出现在正常短信中)
        weak_marketing_keywords = [
            "尊敬的用户", "尊敬的客户", "您好", "注册", "登录", "账户", "账号",
            "会员", "积分", "权益", "提醒", "即将", "有效期", "过期", "升级",
            "更新", "下载", "APP", "应用", "平台", "网址", "点击", "工单"
        ]
        # 短网址和链接(独立评估,结合其他特征判断)
        url_patterns = [
            "dwz.cn", "t.cn", "短网址", "http://", "https://", "cmbt.cn"
        ]
        # 业务短信特征(用于反向识别,降低误判率)
        # 快递短信特征
        express_keywords = [
            "快递", "包裹", "取件码", "取件", "签收", "派送", "配送", "物流",
            "驿站", "在途", "揽收", "暂存", "已到达", "丰巢", "柜取件", "柜机"
        ]
        # 还款短信特征
        repayment_keywords = [
            "还款", "账单", "信用卡", "借款", "贷款", "逾期", "欠款", "最低还款",
            "应还金额", "到期还款", "还清", "应还", "还款日", "账单¥", "账单¥", "查账还款",
            "扣款用于", "房贷还款", "信用卡还款", "车贷还款", "应还款额", "最低还款额"
        ]
        # 收入短信特征
        income_keywords = [
            "收入", "转账", "入账", "到账", "支付", "工资", "报销", "余额",
            "成功收款", "收到", "款项", "收入金额"
        ]
        # 航班/火车票特征
        travel_keywords = [
            "航班", "航空", "飞机", "机票", "火车", "铁路", "列车", "车票",
            "出发", "抵达", "起飞", "登机", "候车", "检票"
        ]
        # 额外增加:通知类短信特征(通常不需要处理的短信)
        notification_keywords = [
            "余额不足", "话费不足", "话费余额", "通讯费", "流量用尽", "流量不足",
            "停机", "恢复通话", "自动充值", "交费", "缴费",
            "消费提醒", "交易提醒", "动账", "短信通知", "验证码", "校验码", "安全码"
        ]
        # 运营商标识
        telecom_keywords = [
            "中国电信", "中国移动", "中国联通", "电信", "移动", "联通",
            "携号转网", "号码服务", "通讯服务", "189.cn", "10086", "10010"
        ]
        # 银行和金融机构标识
        bank_keywords = [
            "信用卡", "储蓄卡", "借记卡", "储蓄", "银联",
            "建设银行", "工商银行", "农业银行", "中国银行", "交通银行",
            "招商银行", "浦发银行", "民生银行", "兴业银行", "广发银行",
            "平安银行", "中信银行", "光大银行", "华夏银行", "邮储银行",
            "农商银行", "支付宝", "微信支付", "京东金融", "度小满", "陆金所"
        ]
        # 特殊情况检查:招商银行账单短信,不应被过滤
        if ("招商银行" in text and ("账单" in text or "还款日" in text)) or "cmbt.cn" in text:
            if "还款" in text or "账单" in text or "消费卡" in text:
                return False  # 是还款短信,不过滤
        # 计算评分
        # 首先检查业务短信特征,如果明确是业务短信,直接返回False
        has_express_feature = any(keyword in text for keyword in express_keywords)
        has_repayment_feature = any(keyword in text for keyword in repayment_keywords)
        has_income_feature = any(keyword in text for keyword in income_keywords)
        has_travel_feature = any(keyword in text for keyword in travel_keywords)
        # 检查是否为百度通知
        is_baidu_notification = "百度" in text and "尊敬的用户" in text
        if is_baidu_notification:
            return True  # 百度通知应被过滤
        # 如果短信中包含多个业务关键词(≥2个),很可能是重要的业务短信
        business_score = (has_express_feature + has_repayment_feature +
                         has_income_feature + has_travel_feature)
        if business_score >= 2 and not is_baidu_notification:
            return False  # 多个业务特征同时存在,不太可能是营销短信
        # 检查强营销特征
        for keyword in strong_marketing_keywords:
            if keyword in text:
                score += 3
        # 检查一般营销特征
        for keyword in general_marketing_keywords:
            if keyword in text:
                score += 2
        # 检查弱营销特征
        for keyword in weak_marketing_keywords:
            if keyword in text:
                score += 1
        # 检查URL特征(结合是否存在业务特征)
        has_url = any(pattern in text for pattern in url_patterns)
        # 降低业务特征短信的营销判定分数
        if has_express_feature and not is_baidu_notification:
            score -= 3  # 快递特征明显减分
        if has_repayment_feature:
            score -= 3  # 还款特征明显减分
        if has_income_feature:
            score -= 2  # 收入特征减分
        if has_travel_feature:
            score -= 2  # 旅行特征减分
        # 检查通知类短信特征(但不包括重要的业务短信)
        if not has_express_feature and not has_repayment_feature:  # 确保不是快递和还款短信
            notification_count = sum(1 for keyword in notification_keywords if keyword in text)
            if notification_count >= 2:  # 需要至少2个通知关键词才判定
                score += notification_count  # 增加判定为营销/通知短信的可能性
        # 检查运营商和银行标识(结合其他特征判断)
        has_telecom_feature = any(keyword in text for keyword in telecom_keywords)
        has_bank_feature = any(keyword in text for keyword in bank_keywords)
        # URL的评分处理
        if has_url:
            if (has_express_feature or has_repayment_feature or has_income_feature or has_travel_feature) and not is_baidu_notification:
                # URL在业务短信中可能是正常的追踪链接,不增加评分
                pass
            else:
                # 纯URL且无业务特征,可能是营销短信
                score += 2
        # 特殊情况:运营商余额通知
        if has_telecom_feature and "余额" in text and not has_income_feature:
            score += 2
        # 设置判定阈值
        threshold = 4  # 需要至少4分才判定为营销短信
        return score >= threshold
    def is_notification_sms(self, text: str) -> bool:
        """判断是否为通知类短信(如银行交易通知、运营商提醒等)"""
        # 银行交易通知特征(不包括还款提醒)
        bank_transaction_patterns = [
            r"您尾号\d+的.+消费",
            r"您.+账户消费[\d,.]+元",
            r"交易[\d,.]+元",
            r"支付宝.+消费",
            r"微信支付.+消费",
            r"\d{1,2}月\d{1,2}日\d{1,2}[::]\d{1,2}消费",
            r"银行卡([支付|消费|扣款])"
        ]
        # 排除规则:包含以下关键词的短信不应被判定为通知短信
        business_keywords = [
            # 还款关键词
            "还款", "账单", "应还", "到期还款", "还款日", "最低还款", "账单¥", "账单¥", "查账还款",
            # 快递关键词
            "快递", "包裹", "取件码", "取件", "签收", "派送", "配送",
            # 收入关键词
            "收入", "转账", "入账", "到账", "支付成功", "工资", "支付宝转账", "微信转账"
        ]
        # 先检查是否为收入短信(优先于通知判断)
        income_indicators = ["收入", "入账", "转账", "工资"]
        for indicator in income_indicators:
            if indicator in text and "元" in text:
                # 可能是收入短信,不要判断为通知
                return False
        # 运营商余额通知特征
        telecom_balance_patterns = [
            r"余额[不足|低于][\d,.]+元",
            r"话费[不足|仅剩][\d,.]+元",
            r"流量[不足|即将用尽]",
            r"[电信|移动|联通].+余额",
            r"[停机|停号]提醒",
            r"为了保障您的正常通讯",
        ]
        # 首先检查是否包含业务关键词,有则不应判定为通知短信
        for keyword in business_keywords:
            if keyword in text:
                return False  # 包含业务关键词,不是需要过滤的通知短信
        # 检查银行交易通知模式
        for pattern in bank_transaction_patterns:
            if re.search(pattern, text):
                logger.debug(f"识别到银行交易通知短信:{text[:30]}...")
                return True
        # 检查运营商余额通知模式
        for pattern in telecom_balance_patterns:
            if re.search(pattern, text):
                logger.debug(f"识别到运营商余额通知短信:{text[:30]}...")
                return True
        return False
    def extract_entities(self, text: str) -> Dict[str, Optional[str]]:
        """提取文本中的实体"""
@@ -225,20 +466,62 @@
                    if word in code:
                        code = code[:code.index(word)]
                
                # 只保留字母、数字和连字符
                code = ''.join(c for c in code if c.isalnum() or c == "-")
                # 针对不同快递公司采取不同策略
                if "丰巢" in text or "蜂巢" in text:
                    # 对于丰巢快递,只保留数字
                    code = ''.join(c for c in code if c.isdigit())
                elif "中国邮政" in text:
                    # 对于中国邮政,尝试提取完整货位号
                    cargo_match = re.search(r'货号[::]\s*([0-9A-Za-z\-]+)', text)
                    if cargo_match:
                        code = cargo_match.group(1)
                    else:
                        # 保留原始格式(字母、数字和连字符)
                        code = ''.join(c for c in code if c.isalnum() or c == "-")
                else:
                    # 对于其他快递,保留字母、数字和连字符
                    code = ''.join(c for c in code if c.isalnum() or c == "-")
                
                # 确保格式正确
                parts = code.split("-")
                valid_parts = []
                for part in parts:
                    if part and any(c.isalnum() for c in part):
                        valid_parts.append(part)
                if valid_parts:
                    result["pickup_code"] = "-".join(valid_parts)
                if "丰巢" in text or "蜂巢" in text:
                    # 对于丰巢快递,只保留数字
                    if code.isdigit():
                        result["pickup_code"] = code
                    else:
                        # 如果没有数字,尝试再次从文本中匹配纯数字取件码
                        pickup_code_match = re.search(r'码[^0-9]*(\d+)', text)
                        if pickup_code_match:
                            result["pickup_code"] = pickup_code_match.group(1)
                        else:
                            result["pickup_code"] = None
                elif "中国邮政" in text:
                    # 对于中国邮政,验证格式是否合理
                    if re.match(r'[0-9A-Za-z\-]+', code):
                        result["pickup_code"] = code
                    else:
                        # 二次尝试:从文本中直接获取货位号
                        cargo_match = re.search(r'货号[::]\s*([0-9A-Za-z\-]+)', text)
                        if cargo_match:
                            result["pickup_code"] = cargo_match.group(1)
                        else:
                            # 最后尝试:提取"取件密码"后的纯数字序列
                            password_match = re.search(r'密码\s*(\d+)', text)
                            if password_match:
                                result["pickup_code"] = password_match.group(1)
                            else:
                                result["pickup_code"] = None
                else:
                    result["pickup_code"] = None
                    # 对于其他快递,保持原有逻辑
                    parts = code.split("-")
                    valid_parts = []
                    for part in parts:
                        if part and any(c.isalnum() for c in part):
                            valid_parts.append(part)
                    if valid_parts:
                        result["pickup_code"] = "-".join(valid_parts)
                    else:
                        result["pickup_code"] = None
            # 清理公司名称
            if result["company"]:
@@ -247,7 +530,16 @@
                for word in invalid_words:
                    if company.endswith(word):
                        company = company[:-len(word)]
                # 特殊处理中国邮政
                if company == "中国" and "中国邮政" in text:
                    company = "邮政"
                elif "中国邮政" in text and not company:
                    company = "邮政"
                result["company"] = company.strip()
            elif "中国邮政" in text:  # 如果NER未识别但文本中有中国邮政
                result["company"] = "邮政"
            # 清理地址
            if result["address"]:
@@ -390,7 +682,8 @@
                
                # 尝试查找最低还款金额
                min_amount_match = re.search(r'最低还款([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', context)
                if min_amount_match and "MIN_CODE" in current_entity["type"]:
                # 修复:确保current_entity存在且有type属性再使用
                if min_amount_match and 'current_entity' in locals() and current_entity is not None and "MIN_CODE" in current_entity["type"]:
                    return min_amount_match.group(1)  # 直接返回匹配到的最低还款金额,保留原始格式
                    
                # 在上下文中查找完整金额
@@ -504,8 +797,15 @@
                    result["date"] = date
            # 处理金额
            # 先尝试使用正则表达式直接匹配金额
            amount_match = re.search(r'(?:应还|还款)?金额([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', text)
            # 尝试匹配带¥符号的账单金额模式
            amount_match = re.search(r'账单¥([\d,]+\.?\d*)', text)
            if not amount_match:
                # 尝试匹配带¥符号的账单金额模式
                amount_match = re.search(r'账单¥([\d,]+\.?\d*)', text)
            if not amount_match:
                # 尝试匹配一般金额模式
                amount_match = re.search(r'(?:应还|还款)?金额([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', text)
            if amount_match:
                amount = amount_match.group(1)  # 保留原始格式(带逗号)
                # 验证金额有效性
@@ -531,9 +831,13 @@
                
                # 如果还是没有找到,尝试从文本中提取
                if not amount_candidates:
                    # 使用更宽松的正则表达式匹配金额
                    amount_pattern = re.compile(r'([\d,]+\.?\d*)(?:元|块钱|块|万元|万)')
                    matches = list(amount_pattern.finditer(text))
                    # 使用多个正则表达式匹配不同格式的金额
                    # 1. 匹配带¥符号格式
                    matches = list(re.finditer(r'¥([\d,]+\.?\d*)', text))
                    # 2. 匹配带¥符号格式
                    matches.extend(list(re.finditer(r'¥([\d,]+\.?\d*)', text)))
                    # 3. 匹配一般金额格式
                    matches.extend(list(re.finditer(r'([\d,]+\.?\d*)(?:元|块钱|块|万元|万)', text)))
                    
                    for match in matches:
                        amount_text = match.group(1)  # 获取数字部分,保留逗号
@@ -711,37 +1015,116 @@
                    result["datetime"] = datetime
            # 处理收入金额
            amount_candidates = []
            # 首先从识别的实体中获取
            for amount in entities["PICKUP_CODE"]:
                cleaned_amount = clean_amount(amount, text)
                if cleaned_amount:
                    try:
                        value = float(cleaned_amount)
                        amount_candidates.append((cleaned_amount, value))
                    except ValueError:
                        continue
            # 先尝试使用正则表达式直接匹配收入金额,包括"收入金额"格式
            amount_match = re.search(r'收入金额([\d,]+\.?\d*)元', text)
            if not amount_match:
                # 尝试匹配一般收入格式
                amount_match = re.search(r'收入([\d,]+\.?\d*)元', text)
            
            # 如果没有找到有效金额,直接从文本中尝试提取
            if not amount_candidates:
                # 直接在整个文本中寻找金额模式
                amount_pattern = re.compile(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)')
                matches = list(amount_pattern.finditer(text))
            if amount_match:
                amount = amount_match.group(1)  # 保留原始格式(带逗号)
                # 验证金额有效性
                try:
                    value = float(amount.replace(',', ''))
                    if value > 0:
                        result["amount"] = amount
                except ValueError:
                    pass
            # 如果正则没有匹配到,继续尝试NER结果
            if not result["amount"]:
                amount_candidates = []
                # 首先从识别的实体中获取
                for amount in entities["PICKUP_CODE"]:
                    cleaned_amount = clean_amount(amount, text)
                    if cleaned_amount:
                        try:
                            value = float(cleaned_amount)
                            amount_candidates.append((cleaned_amount, value))
                        except ValueError:
                            continue
                
                for match in matches:
                    amount_text = match.group(1)
                # 如果没有找到有效金额,直接从文本中尝试提取
                if not amount_candidates:
                    # 尝试多种模式匹配金额
                    # 1. 匹配"收入金额xxx元"模式
                    matches = list(re.finditer(r'收入金额([\d,]+\.?\d*)元', text))
                    # 2. 匹配"收入xxx元"模式
                    matches.extend(list(re.finditer(r'收入([\d,]+\.?\d*)元', text)))
                    # 3. 匹配带元结尾的金额
                    matches.extend(list(re.finditer(r'([0-9,]+\.[0-9]+)元', text)))
                    # 4. 匹配普通数字(可能是余额),但排除已识别为余额的金额
                    if "余额" in text:
                        balance_match = re.search(r'余额([\d,]+\.?\d*)元', text)
                        if balance_match:
                            balance_value = balance_match.group(1)
                            # 只匹配不等于余额的金额
                            all_numbers = re.finditer(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)', text)
                            for match in all_numbers:
                                if match.group(1) != balance_value:
                                    matches.append(match)
                    else:
                        matches.extend(list(re.finditer(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)', text)))
                    for match in matches:
                        amount_text = match.group(1)
                        try:
                            value = float(amount_text.replace(',', ''))
                            amount_candidates.append((amount_text, value))
                        except ValueError:
                            continue
                # 从金额候选中排除已识别的余额值
                if result["balance"]:
                    try:
                        value = float(amount_text.replace(',', ''))
                        amount_candidates.append((amount_text, value))
                        balance_value = float(result["balance"].replace(',', ''))
                        amount_candidates = [(text, value) for text, value in amount_candidates if abs(value - balance_value) > 0.01]
                    except ValueError:
                        continue
            # 选择最合适的有效金额
            if amount_candidates:
                result["amount"] = max(amount_candidates, key=lambda x: x[1])[0]
                        pass
                # 选择适当的金额作为收入
                if amount_candidates:
                    has_income_amount_keyword = "收入金额" in text
                    if has_income_amount_keyword:
                        # 查找"收入金额"附近的数字
                        idx = text.find("收入金额")
                        if idx != -1:
                            closest_amount = None
                            min_distance = float('inf')
                            for amount_text, value in amount_candidates:
                                # 找到这个数字在原文中的位置
                                amount_idx = text.find(amount_text)
                                if amount_idx != -1:
                                    distance = abs(amount_idx - idx)
                                    if distance < min_distance:
                                        min_distance = distance
                                        closest_amount = amount_text
                            if closest_amount:
                                result["amount"] = closest_amount
                            else:
                                # 如果无法找到最近的金额,使用最大金额策略
                                result["amount"] = max(amount_candidates, key=lambda x: x[1])[0]
                    else:
                        # 如果没有"收入金额"关键词,则使用最大金额策略
                        result["amount"] = max(amount_candidates, key=lambda x: x[1])[0]
            # 处理余额
            if entities["BALANCE"]:
            # 先尝试使用正则表达式直接匹配余额
            balance_match = re.search(r'余额([\d,]+\.?\d*)元', text)
            if balance_match:
                balance = balance_match.group(1)  # 保留原始格式(带逗号)
                # 验证金额有效性
                try:
                    value = float(balance.replace(',', ''))
                    if value > 0:
                        result["balance"] = balance
                except ValueError:
                    pass
            # 如果正则没有匹配到,使用NER结果
            if not result["balance"] and entities["BALANCE"]:
                for amount in entities["BALANCE"]:
                    cleaned_amount = clean_amount(amount, text)
                    if cleaned_amount:
@@ -974,6 +1357,45 @@
app = Flask(__name__)
model_manager = ModelManager()
# 添加保存短信到文件的函数
def save_sms_to_file(text: str, category: Optional[str] = None, confidence: Optional[float] = None) -> bool:
    """Append an SMS and its classification result to a per-day log file.

    The record goes to ``./sms_logs/sms_log_<YYYY-MM-DD>.txt``; the directory
    is created when missing. This is best-effort: failures are logged and
    reported through the return value instead of being raised.

    Args:
        text: SMS content.
        category: Classification result, if already known.
        confidence: Classification confidence, if already known.

    Returns:
        True when the record was written, False otherwise.
    """
    try:
        log_dir = "./sms_logs"
        # exist_ok avoids the check-then-create race between concurrent requests.
        os.makedirs(log_dir, exist_ok=True)

        # One timestamp for both the file name and the record, so the two
        # cannot disagree when a call straddles midnight.
        now = datetime.datetime.now()
        file_path = os.path.join(log_dir, f"sms_log_{now.strftime('%Y-%m-%d')}.txt")
        current_time = now.strftime("%Y-%m-%d %H:%M:%S")

        # Compare against None: a confidence of 0.0 is a valid value and must
        # not be logged as "unclassified".
        if category and confidence is not None:
            category_info = f"分类: {category}, 置信度: {confidence:.4f}"
        else:
            category_info = "未分类"
        log_content = f"[{current_time}] {category_info}\n{text}\n{'='*50}\n"

        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(log_content)
        return True
    except Exception as e:
        logger.error(f"保存短信到文件失败: {str(e)}")
        return False
@app.route("/health", methods=["GET"])
def health_check():
    """健康检查接口"""
@@ -994,9 +1416,233 @@
        text = data["content"]
        if not isinstance(text, str) or not text.strip():
            raise BadRequest("短信内容不能为空")
        # 保存原始短信内容到文件
        save_sms_to_file(text)
        # 已还清或已结清的短信模式
        already_paid_patterns = [
            r"已还清",
            r"已结清",
            r"还款.*?入账后.*?已还清",
            r"还款.*?入账.*?结清"
        ]
        # 检查是否为已还清/已结清的短信
        for pattern in already_paid_patterns:
            if re.search(pattern, text):
                logger.info(f"识别为已还清/已结清短信,归类为其他: {text[:30]}...")
                category = "其他"
                save_sms_to_file(text, category, 1.0)
                return jsonify({
                    "status": "success",
                    "data": {
                        "category": category,
                        "details": {}
                    }
                })
        # 银行收入短信特征识别
        income_patterns = [
            r"收入金额[\d,.]+元",
            r"账户.*?收入.*?[\d,.]+元",
            r"账户.*?工资",
            r"工资.*?收入",
            r"入账[\d,.]+元",
            # 添加新模式匹配南京银行等类似格式
            r"收入\d+\.\d+元",
            r"账号.*?收入\d+\.\d+元",
            r"尾号\d+的账号.*?收入\d+\.\d+元",
            r"支付宝转账",
            r"转账.*?收入"
        ]
        # 银行还款短信特征识别
        repayment_patterns = [
            r"信用卡.*?还款",
            r"账单.*?[\d,.]+元",
            r"应还款额.*?[\d,.]+元",
            r"最低还款额.*?[\d,.]+元",
            r"到期还款日",
            r"扣款.*?用于.*?还款",
            r"扣款.*?用于.*?贷款",
            r"扣款.*?用于.*?信用卡",
            r"车贷还款",
            r"房贷还款",
            r"贷款还款",
            r"信用卡账单",
            r"下次还款日"
        ]
        # 检查是否为收入短信
        for pattern in income_patterns:
            if re.search(pattern, text):
                logger.info(f"识别为收入短信: {text[:30]}...")
                category = "收入"
                details = model_manager.extract_income_entities(text)
                save_sms_to_file(text, category, 1.0)
                return jsonify({
                    "status": "success",
                    "data": {
                        "category": category,
                        "details": details
                    }
                })
        # 检查是否为还款短信
        for pattern in repayment_patterns:
            if re.search(pattern, text):
                # 二次检查:如果包含"已还清"或"已结清"等词,归类为"其他"
                if any(re.search(paid_pattern, text) for paid_pattern in already_paid_patterns):
                    logger.info(f"虽然识别为还款短信,但包含已还清/已结清,归类为其他: {text[:30]}...")
                    category = "其他"
                    save_sms_to_file(text, category, 1.0)
                    return jsonify({
                        "status": "success",
                        "data": {
                            "category": category,
                            "details": {}
                        }
                    })
                logger.info(f"识别为还款短信: {text[:30]}...")
                category = "还款"
                details = model_manager.extract_repayment_entities(text)
                save_sms_to_file(text, category, 1.0)
                return jsonify({
                    "status": "success",
                    "data": {
                        "category": category,
                        "details": details
                    }
                })
        # 特定短信识别逻辑 - 针对百度通知和招商银行账单
        # 识别百度通知
        if "百度智能云" in text and "尊敬的用户" in text and "免费额度" in text:
            logger.info(f"直接识别为百度通知短信: {text[:30]}...")
            category = "其他"
            save_sms_to_file(text, category, 1.0)  # 记录分类结果
            return jsonify({
                "status": "success",
                "data": {
                    "category": category,
                    "details": {}
                }
            })
        # 识别招商银行账单
        if "招商银行" in text and ("账单¥" in text or "账单¥" in text or "还款日" in text):
            # 检查是否为已还清/已结清短信
            if any(re.search(paid_pattern, text) for paid_pattern in already_paid_patterns):
                logger.info(f"招商银行短信包含已还清/已结清,归类为其他: {text[:30]}...")
                category = "其他"
                save_sms_to_file(text, category, 1.0)
                return jsonify({
                    "status": "success",
                    "data": {
                        "category": category,
                        "details": {}
                    }
                })
            logger.info(f"直接识别为招商银行还款短信: {text[:30]}...")
            category = "还款"
            details = model_manager.extract_repayment_entities(text)
            save_sms_to_file(text, category, 1.0)  # 记录分类结果
            return jsonify({
                "status": "success",
                "data": {
                    "category": category,
                    "details": details
                }
            })
        # 处理短信
        category = model_manager.classify_sms(text)
        category, confidence = model_manager.classify_sms(text)
        # 保存短信内容和分类结果
        save_sms_to_file(text, category, confidence)
        # 如果是明确的业务短信类别,直接进入处理流程
        if category in ["快递", "还款", "收入", "航班", "火车票"] and confidence > 0.5:
            # 对百度通知的特殊处理
            if category == "快递" and "百度" in text and "尊敬的用户" in text:
                logger.info(f"纠正百度通知短信的分类: {text[:30]}...")
                category = "其他"
                save_sms_to_file(text, category, confidence)  # 更新分类结果
                return jsonify({
                    "status": "success",
                    "data": {
                        "category": category,
                        "details": {}
                    }
                })
            # 对于高置信度的业务分类,直接进入实体提取
            if category == "快递":
                details = model_manager.extract_entities(text)
            elif category == "还款":
                details = model_manager.extract_repayment_entities(text)
            elif category == "收入":
                details = model_manager.extract_income_entities(text)
            elif category == "航班":
                details = model_manager.extract_flight_entities(text)
            elif category == "火车票":
                details = model_manager.extract_train_entities(text)
            logger.info(f"高置信度业务短信: {text[:30]}..., category: {category}, confidence: {confidence:.4f}")
            return jsonify({
                "status": "success",
                "data": {
                    "category": category,
                    "details": details
                }
            })
        # 检查是否为营销/广告短信
        if model_manager.is_marketing_sms(text):
            # 如果是营销/广告短信,直接归类为"其他"
            logger.info(f"检测到营销/广告短信: {text[:30]}...")
            category = "其他"
            save_sms_to_file(text, category, confidence)  # 更新分类结果
            return jsonify({
                "status": "success",
                "data": {
                    "category": category,
                    "details": {}
                }
            })
        # 检查是否为通知类短信
        if model_manager.is_notification_sms(text):
            # 如果是通知类短信,直接归类为"其他"
            logger.info(f"检测到通知类短信: {text[:30]}...")
            category = "其他"
            save_sms_to_file(text, category, confidence)  # 更新分类结果
            return jsonify({
                "status": "success",
                "data": {
                    "category": category,
                    "details": {}
                }
            })
        # 置信度阈值,低于此阈值的分类结果被视为"其他"
        confidence_threshold = 0.7
        if confidence < confidence_threshold:
            logger.info(f"短信分类置信度低({confidence:.4f}),归类为'其他': {text[:30]}...")
            category = "其他"
            save_sms_to_file(text, category, confidence)  # 更新分类结果
            return jsonify({
                "status": "success",
                "data": {
                    "category": category,
                    "details": {}
                }
            })
        # 根据分类结果调用对应的实体提取函数
        if category == "快递":
            details = model_manager.extract_entities(text)
        elif category == "还款":
@@ -1005,13 +1651,13 @@
            details = model_manager.extract_income_entities(text)
        elif category == "航班":
            details = model_manager.extract_flight_entities(text)
        elif category == "火车票":  # 添加火车票类别处理
        elif category == "火车票":
            details = model_manager.extract_train_entities(text)
        else:
            details = {}
        
        # 记录处理结果
        logger.info(f"Successfully processed SMS: {text[:30]}...")
        logger.info(f"Successfully processed SMS: {text[:30]}..., category: {category}, confidence: {confidence:.4f}")
        
        return jsonify({
            "status": "success",
@@ -1020,7 +1666,7 @@
                "details": details
            }
        })
        save_sms_to_file
    except BadRequest as e:
        logger.warning(f"Invalid request: {str(e)}")
        return jsonify({