From e6fed94443177826cf7497a85e9cdcfc7c43ee21 Mon Sep 17 00:00:00 2001 From: cloudroam <cloudroam> Date: 星期一, 21 四月 2025 16:49:49 +0800 Subject: [PATCH] fix --- app.py | 718 ++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 files changed, 628 insertions(+), 90 deletions(-) diff --git a/app.py b/app.py index 47aa332..9a8237a 100644 --- a/app.py +++ b/app.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os import logging +import datetime from typing import Dict, Optional, Tuple from flask import Flask, request, jsonify @@ -27,8 +28,8 @@ self.ner_path = "./models/ner_model/best_model" self.repayment_path = "./models/repayment_model/best_model" self.income_path = "./models/income_model/best_model" - self.flight_path = "./models/flight_model/best_model" - self.train_path = "./models/train_model/best_model" # 添加火车票模型路径 + # self.flight_path = "./models/flight_model/best_model" + # self.train_path = "./models/train_model/best_model" # 添加火车票模型路径 # 检查模型文件 self._check_model_files() @@ -38,16 +39,16 @@ self.ner_tokenizer, self.ner_model = self._load_ner() self.repayment_tokenizer, self.repayment_model = self._load_repayment() self.income_tokenizer, self.income_model = self._load_income() - self.flight_tokenizer, self.flight_model = self._load_flight() - self.train_tokenizer, self.train_model = self._load_train() # 加载火车票模型 + # self.flight_tokenizer, self.flight_model = self._load_flight() + # self.train_tokenizer, self.train_model = self._load_train() # 加载火车票模型 # 将模型设置为评估模式 self.classifier_model.eval() self.ner_model.eval() self.repayment_model.eval() self.income_model.eval() - self.flight_model.eval() - self.train_model.eval() # 设置火车票模型为评估模式 + # self.flight_model.eval() + # self.train_model.eval() # 设置火车票模型为评估模式 def _check_model_files(self): """检查模型文件是否存在""" @@ -59,10 +60,10 @@ raise RuntimeError("还款模型文件不存在,请先运行训练脚本") if not os.path.exists(self.income_path): raise RuntimeError("收入模型文件不存在,请先运行训练脚本") - if not os.path.exists(self.flight_path): - raise RuntimeError("航班模型文件不存在,请先运行训练脚本") - if not os.path.exists(self.train_path): - raise RuntimeError("火车票模型文件不存在,请先运行训练脚本") + # if not os.path.exists(self.flight_path): + # raise RuntimeError("航班模型文件不存在,请先运行训练脚本") + # if not os.path.exists(self.train_path): + # raise RuntimeError("火车票模型文件不存在,请先运行训练脚本") def _load_classifier(self) -> Tuple[BertTokenizer, BertForSequenceClassification]: """加载分类模型""" @@ -124,8 +125,8 @@ logger.error(f"加载火车票模型失败: {str(e)}") raise - def classify_sms(self, text: str) -> str: - """对短信进行分类""" + def classify_sms(self, text: str) -> Tuple[str, float]: + """对短信进行分类,并返回置信度""" try: inputs = self.classifier_tokenizer( text, @@ -135,11 +136,243 @@ ) with torch.no_grad(): outputs = self.classifier_model(**inputs) - pred_id = outputs.logits.argmax().item() - return self.classifier_model.config.id2label[pred_id] + + # 获取预测标签及其对应的概率 + logits = outputs.logits + probabilities = torch.softmax(logits, dim=1) + pred_id = logits.argmax().item() + confidence = probabilities[0, pred_id].item() # 获取预测标签的置信度 + + return self.classifier_model.config.id2label[pred_id], confidence except Exception as e: logger.error(f"短信分类失败: {str(e)}") raise + + def is_marketing_sms(self, text: str) -> bool: + """判断是否为营销/广告类短信,采用评分系统""" + # 特定字符串模式检查:直接匹配明显的营销/通知短信 + marketing_patterns = [ + # 百度类通知 + r"百度智能云.*?尊敬的用户", + r"百度.*?账户.*?tokens", + r"AppBuilder.*?账户", + r"账户有.*?免费额度", + r".*?免费额度.*?过期", + r"dwz\.cn\/[A-Za-z0-9]+" + ] + + # 对特定模式直接判断 + for pattern in marketing_patterns: + if re.search(pattern, text): + return True # 直接认为是营销短信 + + # 评分系统:根据短信内容特征进行评分,超过阈值判定为营销短信 + score = 0 + + # 强营销特征关键词(高权重) + strong_marketing_keywords = [ + "有奖", "免费赠送", "抽奖", "中奖", "优惠券", "折扣券", "特价", "秒杀", + "限时抢购", "促销", "推广", "广告", "代金券", "0元购", "tokens调用量" + ] + + # 一般营销特征关键词(中等权重) + general_marketing_keywords = [ + "活动", "优惠", "折扣", "限时", "抢购", "特价", "promotion", "推广", + "开业", "集点", "集赞", "关注", "公众号", "小程序", "注册有礼", "免费额度" + ] + + # 弱营销特征关键词(低权重,可能出现在正常短信中) + weak_marketing_keywords = [ + "尊敬的用户", "尊敬的客户", "您好", "注册", "登录", "账户", "账号", + "会员", "积分", "权益", "提醒", "即将", "有效期", "过期", "升级", + "更新", "下载", "APP", "应用", "平台", "网址", "点击", "工单" + ] + + # 短网址和链接(独立评估,结合其他特征判断) + url_patterns = [ + "dwz.cn", "t.cn", "短网址", "http://", "https://", "cmbt.cn" + ] + + # 业务短信特征(用于反向识别,降低误判率) + # 快递短信特征 + express_keywords = [ + "快递", "包裹", "取件码", "取件", "签收", "派送", "配送", "物流", + "驿站", "在途", "揽收", "暂存", "已到达", "丰巢", "柜取件", "柜机" + ] + + # 还款短信特征 + repayment_keywords = [ + "还款", "账单", "信用卡", "借款", "贷款", "逾期", "欠款", "最低还款", + "应还金额", "到期还款", "还清", "应还", "还款日", "账单¥", "账单¥", "查账还款" + ] + + # 收入短信特征 + income_keywords = [ + "收入", "转账", "入账", "到账", "支付", "工资", "报销", "余额", + "成功收款", "收到", "款项" + ] + + # 航班/火车票特征 + travel_keywords = [ + "航班", "航空", "飞机", "机票", "火车", "铁路", "列车", "车票", + "出发", "抵达", "起飞", "登机", "候车", "检票" + ] + + # 额外增加:通知类短信特征(通常不需要处理的短信) + notification_keywords = [ + "余额不足", "话费不足", "话费余额", "通讯费", "流量用尽", "流量不足", + "停机", "恢复通话", "自动充值", "交费", "缴费", + "消费提醒", "交易提醒", "动账", "短信通知", "验证码", "校验码", "安全码" + ] + + # 运营商标识 + telecom_keywords = [ + "中国电信", "中国移动", "中国联通", "电信", "移动", "联通", + "携号转网", "号码服务", "通讯服务", "189.cn", "10086", "10010" + ] + + # 银行和金融机构标识 + bank_keywords = [ + "信用卡", "储蓄卡", "借记卡", "储蓄", "银联", + "建设银行", "工商银行", "农业银行", "中国银行", "交通银行", + "招商银行", "浦发银行", "民生银行", "兴业银行", "广发银行", + "平安银行", "中信银行", "光大银行", "华夏银行", "邮储银行", + "农商银行", "支付宝", "微信支付", "京东金融", "度小满", "陆金所" + ] + + # 特殊情况检查:招商银行账单短信,不应被过滤 + if ("招商银行" in text and ("账单" in text or "还款日" in text)) or "cmbt.cn" in text: + if "还款" in text or "账单" in text or "消费卡" in text: + return False # 是还款短信,不过滤 + + # 计算评分 + # 首先检查业务短信特征,如果明确是业务短信,直接返回False + has_express_feature = any(keyword in text for keyword in express_keywords) + has_repayment_feature = any(keyword in text for keyword in repayment_keywords) + has_income_feature = any(keyword in text for keyword in income_keywords) + has_travel_feature = any(keyword in text for keyword in travel_keywords) + + # 检查是否为百度通知 + is_baidu_notification = "百度" in text and "尊敬的用户" in text + if is_baidu_notification: + return True # 百度通知应被过滤 + + # 如果短信中包含多个业务关键词(≥2个),很可能是重要的业务短信 + business_score = (has_express_feature + has_repayment_feature + + has_income_feature + has_travel_feature) + if business_score >= 2 and not is_baidu_notification: + return False # 多个业务特征同时存在,不太可能是营销短信 + + # 检查强营销特征 + for keyword in strong_marketing_keywords: + if keyword in text: + score += 3 + + # 检查一般营销特征 + for keyword in general_marketing_keywords: + if keyword in text: + score += 2 + + # 检查弱营销特征 + for keyword in weak_marketing_keywords: + if keyword in text: + score += 1 + + # 检查URL特征(结合是否存在业务特征) + has_url = any(pattern in text for pattern in url_patterns) + + # 降低业务特征短信的营销判定分数 + if has_express_feature and not is_baidu_notification: + score -= 3 # 快递特征明显减分 + + if has_repayment_feature: + score -= 3 # 还款特征明显减分 + + if has_income_feature: + score -= 2 # 收入特征减分 + + if has_travel_feature: + score -= 2 # 旅行特征减分 + + # 检查通知类短信特征(但不包括重要的业务短信) + if not has_express_feature and not has_repayment_feature: # 确保不是快递和还款短信 + notification_count = sum(1 for keyword in notification_keywords if keyword in text) + if notification_count >= 2: # 需要至少2个通知关键词才判定 + score += notification_count # 增加判定为营销/通知短信的可能性 + + # 检查运营商和银行标识(结合其他特征判断) + has_telecom_feature = any(keyword in text for keyword in telecom_keywords) + has_bank_feature = any(keyword in text for keyword in bank_keywords) + + # URL的评分处理 + if has_url: + if (has_express_feature or has_repayment_feature or has_income_feature or has_travel_feature) and not is_baidu_notification: + # URL在业务短信中可能是正常的追踪链接,不增加评分 + pass + else: + # 纯URL且无业务特征,可能是营销短信 + score += 2 + + # 特殊情况:运营商余额通知 + if has_telecom_feature and "余额" in text and not has_income_feature: + score += 2 + + # 设置判定阈值 + threshold = 4 # 需要至少4分才判定为营销短信 + + return score >= threshold + + def is_notification_sms(self, text: str) -> bool: + """判断是否为通知类短信(如银行交易通知、运营商提醒等)""" + # 银行交易通知特征(不包括还款提醒) + bank_transaction_patterns = [ + r"您尾号\d+的.+消费", + r"您.+账户消费[\d,.]+元", + r"交易[\d,.]+元", + r"支付宝.+消费", + r"微信支付.+消费", + r"\d{1,2}月\d{1,2}日\d{1,2}[::]\d{1,2}消费", + r"银行卡([支付|消费|扣款])" + ] + + # 排除规则:包含以下关键词的短信不应被判定为通知短信 + business_keywords = [ + # 还款关键词 + "还款", "账单", "应还", "到期还款", "还款日", "最低还款", "账单¥", "账单¥", "查账还款", + # 快递关键词 + "快递", "包裹", "取件码", "取件", "签收", "派送", "配送", + # 收入关键词 + "收入", "转账", "入账", "到账", "支付成功", "工资" + ] + + # 运营商余额通知特征 + telecom_balance_patterns = [ + r"余额[不足|低于][\d,.]+元", + r"话费[不足|仅剩][\d,.]+元", + r"流量[不足|即将用尽]", + r"[电信|移动|联通].+余额", + r"[停机|停号]提醒", + r"为了保障您的正常通讯", + ] + + # 首先检查是否包含业务关键词,有则不应判定为通知短信 + for keyword in business_keywords: + if keyword in text: + return False # 包含业务关键词,不是需要过滤的通知短信 + + # 检查银行交易通知模式 + for pattern in bank_transaction_patterns: + if re.search(pattern, text): + logger.debug(f"识别到银行交易通知短信:{text[:30]}...") + return True + + # 检查运营商余额通知模式 + for pattern in telecom_balance_patterns: + if re.search(pattern, text): + logger.debug(f"识别到运营商余额通知短信:{text[:30]}...") + return True + + return False def extract_entities(self, text: str) -> Dict[str, Optional[str]]: """提取文本中的实体""" @@ -382,67 +615,60 @@ if not amount_text: return None + # 尝试直接在上下文中使用正则表达式查找更完整的金额 + # 如果在同一句话里有类似"应还金额5,800元"这样的模式 + amount_match = re.search(r'(?:应还|还款)?金额([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', context) + if amount_match: + return amount_match.group(1) # 直接返回匹配到的金额,保留原始格式 + + # 尝试查找最低还款金额 + min_amount_match = re.search(r'最低还款([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', context) + if min_amount_match and "MIN_CODE" in current_entity["type"]: + return min_amount_match.group(1) # 直接返回匹配到的最低还款金额,保留原始格式 + # 在上下文中查找完整金额 amount_index = context.find(amount_text) if amount_index != -1: # 扩大搜索范围,查找完整金额 - search_start = max(0, amount_index - 10) # 增加向前搜索范围 + search_start = max(0, amount_index - 10) search_end = min(len(context), amount_index + len(amount_text) + 10) search_text = context[search_start:search_end] - # 使用更精确的正则表达式查找金额模式 - amount_pattern = re.compile(r'(\d{1,10}(?:\.\d{1,2})?)') + # 使用正则表达式查找金额 + amount_pattern = re.compile(r'([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?') matches = list(amount_pattern.finditer(search_text)) - # 找到最接近且最长的完整金额 - best_match = None - min_distance = float('inf') - max_length = 0 - target_pos = amount_index - search_start - - for match in matches: - match_pos = match.start() - distance = abs(match_pos - target_pos) - match_text = match.group(1) + if matches: + # 选择最接近的匹配结果 + best_match = None + min_distance = float('inf') - # 优先选择更长的匹配,除非距离差异太大 - if len(match_text) > max_length or (len(match_text) == max_length and distance < min_distance): - try: - # 验证金额是否合理 - value = float(match_text) - if value > 0 and value <= 9999999.99: # 设置合理的金额范围 - best_match = match_text - min_distance = distance - max_length = len(match_text) - except ValueError: - continue + for match in matches: + distance = abs(match.start() - (amount_index - search_start)) + if distance < min_distance: + min_distance = distance + best_match = match.group(1) # 只取数字部分,保留逗号 + + if best_match: + return best_match - if best_match: - amount_text = best_match - + # 如果上述方法都没找到,则保留原始提取结果但验证其有效性 # 移除货币符号和无效词 for symbol in RepaymentNERConfig.AMOUNT_CONFIG['currency_symbols']: amount_text = amount_text.replace(symbol, '') for word in RepaymentNERConfig.AMOUNT_CONFIG['invalid_words']: amount_text = amount_text.replace(word, '') - # 处理金额中的逗号 - amount_text = amount_text.replace(',', '') - + # 验证金额有效性 + clean_amount = amount_text.replace(',', '') try: - # 转换为浮点数 - value = float(amount_text) - - # 验证整数位数 - integer_part = str(int(value)) - if len(integer_part) <= RepaymentNERConfig.AMOUNT_CONFIG['max_integer_digits']: - # 保持原始小数位数 - if '.' in amount_text: - decimal_places = len(amount_text.split('.')[1]) - return f"{value:.{decimal_places}f}" - return str(int(value)) + value = float(clean_amount) + if value > 0: + # 返回原始格式 + return amount_text except ValueError: pass + return None # 实体提取 @@ -473,12 +699,11 @@ # 处理银行名称 if entities["BANK"]: - # 修改银行名称处理逻辑 bank_parts = [] - seen = set() # 用于去重 + seen = set() for bank in entities["BANK"]: bank = bank.strip() - if bank and bank not in seen: # 避免重复 + if bank and bank not in seen: bank_parts.append(bank) seen.add(bank) bank = "".join(bank_parts) @@ -487,7 +712,14 @@ # 处理还款类型 if entities["TYPE"]: - type_ = "".join(entities["TYPE"]).strip() + type_parts = [] + seen = set() + for type_ in entities["TYPE"]: + type_ = type_.strip() + if type_ and type_ not in seen: + type_parts.append(type_) + seen.add(type_) + type_ = "".join(type_parts) if len(type_) <= RepaymentNERConfig.MAX_ENTITY_LENGTH["TYPE"]: result["type"] = type_ @@ -500,32 +732,85 @@ # 处理日期 if entities["DATE"]: date = "".join(entities["DATE"]) - date = ''.join(c for c in date if c.isdigit() or c in ['年', '月', '日']) + date = ''.join(c for c in date if c.isdigit() or c in ['年', '月', '日', '-']) if date: result["date"] = date # 处理金额 - amount_candidates = [] - for amount in entities["PICKUP_CODE"]: - cleaned_amount = clean_amount(amount, text) - if cleaned_amount: - try: - value = float(cleaned_amount) - amount_candidates.append((cleaned_amount, value)) - except ValueError: - continue + # 尝试匹配带¥符号的账单金额模式 + amount_match = re.search(r'账单¥([\d,]+\.?\d*)', text) + if not amount_match: + # 尝试匹配带¥符号的账单金额模式 + amount_match = re.search(r'账单¥([\d,]+\.?\d*)', text) + if not amount_match: + # 尝试匹配一般金额模式 + amount_match = re.search(r'(?:应还|还款)?金额([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', text) - # 选择最大的有效金额 - if amount_candidates: - # 按金额大小排序,选择最大的 - result["amount"] = max(amount_candidates, key=lambda x: x[1])[0] + if amount_match: + amount = amount_match.group(1) # 保留原始格式(带逗号) + # 验证金额有效性 + try: + value = float(amount.replace(',', '')) + if value > 0: + result["amount"] = amount + except ValueError: + pass + + # 如果正则没有匹配到,使用NER结果 + if not result["amount"]: + amount_candidates = [] + # 从识别的实体中获取 + for amount in entities["PICKUP_CODE"]: + cleaned_amount = clean_amount(amount, text) + if cleaned_amount: + try: + value = float(cleaned_amount.replace(',', '')) + amount_candidates.append((cleaned_amount, value)) + except ValueError: + continue + + # 如果还是没有找到,尝试从文本中提取 + if not amount_candidates: + # 使用多个正则表达式匹配不同格式的金额 + # 1. 匹配带¥符号格式 + matches = list(re.finditer(r'¥([\d,]+\.?\d*)', text)) + # 2. 匹配带¥符号格式 + matches.extend(list(re.finditer(r'¥([\d,]+\.?\d*)', text))) + # 3. 匹配一般金额格式 + matches.extend(list(re.finditer(r'([\d,]+\.?\d*)(?:元|块钱|块|万元|万)', text))) + + for match in matches: + amount_text = match.group(1) # 获取数字部分,保留逗号 + try: + value = float(amount_text.replace(',', '')) + amount_candidates.append((amount_text, value)) + except ValueError: + continue + + # 选择最大的有效金额 + if amount_candidates: + result["amount"] = max(amount_candidates, key=lambda x: x[1])[0] # 处理最低还款金额 - for amount in entities["MIN_CODE"]: - cleaned_amount = clean_amount(amount, text) # 传入原始文本作为上下文 - if cleaned_amount: - result["min_amount"] = cleaned_amount - break + # 先尝试使用正则表达式直接匹配最低还款金额 + min_amount_match = re.search(r'最低还款([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', text) + if min_amount_match: + min_amount = min_amount_match.group(1) # 保留原始格式(带逗号) + # 验证金额有效性 + try: + value = float(min_amount.replace(',', '')) + if value > 0: + result["min_amount"] = min_amount + except ValueError: + pass + + # 如果正则没有匹配到,使用NER结果 + if not result["min_amount"] and entities["MIN_CODE"]: + for amount in entities["MIN_CODE"]: + cleaned_amount = clean_amount(amount, text) + if cleaned_amount: + result["min_amount"] = cleaned_amount + break return result @@ -571,9 +856,8 @@ search_end = min(len(context), amount_index + len(amount_text) + 10) search_text = context[search_start:search_end] - # 使用正则表达式查找金额模式 - import re - amount_pattern = re.compile(r'(\d{1,10}(?:\.\d{1,2})?)') + # 使用更精确的正则表达式查找金额模式,支持带逗号的金额 + amount_pattern = re.compile(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)') matches = list(amount_pattern.finditer(search_text)) # 找到最接近且最长的完整金额 @@ -671,15 +955,116 @@ result["datetime"] = datetime # 处理收入金额 - if entities["PICKUP_CODE"]: + # 先尝试使用正则表达式直接匹配收入金额,包括"收入金额"格式 + amount_match = re.search(r'收入金额([\d,]+\.?\d*)元', text) + if not amount_match: + # 尝试匹配一般收入格式 + amount_match = re.search(r'收入([\d,]+\.?\d*)元', text) + + if amount_match: + amount = amount_match.group(1) # 保留原始格式(带逗号) + # 验证金额有效性 + try: + value = float(amount.replace(',', '')) + if value > 0: + result["amount"] = amount + except ValueError: + pass + + # 如果正则没有匹配到,继续尝试NER结果 + if not result["amount"]: + amount_candidates = [] + # 首先从识别的实体中获取 for amount in entities["PICKUP_CODE"]: cleaned_amount = clean_amount(amount, text) if cleaned_amount: - result["amount"] = cleaned_amount - break + try: + value = float(cleaned_amount) + amount_candidates.append((cleaned_amount, value)) + except ValueError: + continue + + # 如果没有找到有效金额,直接从文本中尝试提取 + if not amount_candidates: + # 尝试多种模式匹配金额 + # 1. 匹配"收入金额xxx元"模式 + matches = list(re.finditer(r'收入金额([\d,]+\.?\d*)元', text)) + # 2. 匹配"收入xxx元"模式 + matches.extend(list(re.finditer(r'收入([\d,]+\.?\d*)元', text))) + # 3. 匹配带元结尾的金额 + matches.extend(list(re.finditer(r'([0-9,]+\.[0-9]+)元', text))) + # 4. 匹配普通数字(可能是余额),但排除已识别为余额的金额 + if "余额" in text: + balance_match = re.search(r'余额([\d,]+\.?\d*)元', text) + if balance_match: + balance_value = balance_match.group(1) + # 只匹配不等于余额的金额 + all_numbers = re.finditer(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)', text) + for match in all_numbers: + if match.group(1) != balance_value: + matches.append(match) + else: + matches.extend(list(re.finditer(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)', text))) + + for match in matches: + amount_text = match.group(1) + try: + value = float(amount_text.replace(',', '')) + amount_candidates.append((amount_text, value)) + except ValueError: + continue + + # 从金额候选中排除已识别的余额值 + if result["balance"]: + try: + balance_value = float(result["balance"].replace(',', '')) + amount_candidates = [(text, value) for text, value in amount_candidates if abs(value - balance_value) > 0.01] + except ValueError: + pass + + # 选择适当的金额作为收入 + if amount_candidates: + has_income_amount_keyword = "收入金额" in text + + if has_income_amount_keyword: + # 查找"收入金额"附近的数字 + idx = text.find("收入金额") + if idx != -1: + closest_amount = None + min_distance = float('inf') + for amount_text, value in amount_candidates: + # 找到这个数字在原文中的位置 + amount_idx = text.find(amount_text) + if amount_idx != -1: + distance = abs(amount_idx - idx) + if distance < min_distance: + min_distance = distance + closest_amount = amount_text + + if closest_amount: + result["amount"] = closest_amount + else: + # 如果无法找到最近的金额,使用最大金额策略 + result["amount"] = max(amount_candidates, key=lambda x: x[1])[0] + else: + # 如果没有"收入金额"关键词,则使用最大金额策略 + result["amount"] = max(amount_candidates, key=lambda x: x[1])[0] # 处理余额 - if entities["BALANCE"]: + # 先尝试使用正则表达式直接匹配余额 + balance_match = re.search(r'余额([\d,]+\.?\d*)元', text) + if balance_match: + balance = balance_match.group(1) # 保留原始格式(带逗号) + # 验证金额有效性 + try: + value = float(balance.replace(',', '')) + if value > 0: + result["balance"] = balance + except ValueError: + pass + + # 如果正则没有匹配到,使用NER结果 + if not result["balance"] and entities["BALANCE"]: for amount in entities["BALANCE"]: cleaned_amount = clean_amount(amount, text) if cleaned_amount: @@ -912,6 +1297,45 @@ app = Flask(__name__) model_manager = ModelManager() +# 添加保存短信到文件的函数 +def save_sms_to_file(text: str, category: str = None, confidence: float = None) -> bool: + """ + 将短信内容保存到本地文件 + + Args: + text: 短信内容 + category: 分类结果 + confidence: 分类置信度 + + Returns: + bool: 保存成功返回True,否则返回False + """ + try: + # 确保日志目录存在 + log_dir = "./sms_logs" + if not os.path.exists(log_dir): + os.makedirs(log_dir) + + # 创建基于日期的文件名 + today = datetime.datetime.now().strftime("%Y-%m-%d") + file_path = os.path.join(log_dir, f"sms_log_{today}.txt") + + # 获取当前时间 + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # 准备要写入的内容 + category_info = f"分类: {category}, 置信度: {confidence:.4f}" if category and confidence else "未分类" + log_content = f"[{current_time}] {category_info}\n{text}\n{'='*50}\n" + + # 以追加模式写入文件 + with open(file_path, 'a', encoding='utf-8') as f: + f.write(log_content) + + return True + except Exception as e: + logger.error(f"保存短信到文件失败: {str(e)}") + return False + @app.route("/health", methods=["GET"]) def health_check(): """健康检查接口""" @@ -932,9 +1356,123 @@ text = data["content"] if not isinstance(text, str) or not text.strip(): raise BadRequest("短信内容不能为空") - + + # 保存原始短信内容到文件 + save_sms_to_file(text) + + # 特定短信识别逻辑 - 针对百度通知和招商银行账单 + # 识别百度通知 + if "百度智能云" in text and "尊敬的用户" in text and "免费额度" in text: + logger.info(f"直接识别为百度通知短信: {text[:30]}...") + category = "其他" + save_sms_to_file(text, category, 1.0) # 记录分类结果 + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": {} + } + }) + + # 识别招商银行账单 + if "招商银行" in text and ("账单¥" in text or "账单¥" in text or "还款日" in text): + logger.info(f"直接识别为招商银行还款短信: {text[:30]}...") + category = "还款" + details = model_manager.extract_repayment_entities(text) + save_sms_to_file(text, category, 1.0) # 记录分类结果 + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": details + } + }) + # 处理短信 - category = model_manager.classify_sms(text) + category, confidence = model_manager.classify_sms(text) + + # 保存短信内容和分类结果 + save_sms_to_file(text, category, confidence) + + # 如果是明确的业务短信类别,直接进入处理流程 + if category in ["快递", "还款", "收入", "航班", "火车票"] and confidence > 0.5: + # 对百度通知的特殊处理 + if category == "快递" and "百度" in text and "尊敬的用户" in text: + logger.info(f"纠正百度通知短信的分类: {text[:30]}...") + category = "其他" + save_sms_to_file(text, category, confidence) # 更新分类结果 + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": {} + } + }) + + # 对于高置信度的业务分类,直接进入实体提取 + if category == "快递": + details = model_manager.extract_entities(text) + elif category == "还款": + details = model_manager.extract_repayment_entities(text) + elif category == "收入": + details = model_manager.extract_income_entities(text) + elif category == "航班": + details = model_manager.extract_flight_entities(text) + elif category == "火车票": + details = model_manager.extract_train_entities(text) + + logger.info(f"高置信度业务短信: {text[:30]}..., category: {category}, confidence: {confidence:.4f}") + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": details + } + }) + + # 检查是否为营销/广告短信 + if model_manager.is_marketing_sms(text): + # 如果是营销/广告短信,直接归类为"其他" + logger.info(f"检测到营销/广告短信: {text[:30]}...") + category = "其他" + save_sms_to_file(text, category, confidence) # 更新分类结果 + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": {} + } + }) + + # 检查是否为通知类短信 + if model_manager.is_notification_sms(text): + # 如果是通知类短信,直接归类为"其他" + logger.info(f"检测到通知类短信: {text[:30]}...") + category = "其他" + save_sms_to_file(text, category, confidence) # 更新分类结果 + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": {} + } + }) + + # 置信度阈值,低于此阈值的分类结果被视为"其他" + confidence_threshold = 0.7 + if confidence < confidence_threshold: + logger.info(f"短信分类置信度低({confidence:.4f}),归类为'其他': {text[:30]}...") + category = "其他" + save_sms_to_file(text, category, confidence) # 更新分类结果 + return jsonify({ + "status": "success", + "data": { + "category": category, + "details": {} + } + }) + + # 根据分类结果调用对应的实体提取函数 if category == "快递": details = model_manager.extract_entities(text) elif category == "还款": @@ -943,13 +1481,13 @@ details = model_manager.extract_income_entities(text) elif category == "航班": details = model_manager.extract_flight_entities(text) - elif category == "火车票": # 添加火车票类别处理 + elif category == "火车票": details = model_manager.extract_train_entities(text) else: details = {} # 记录处理结果 - logger.info(f"Successfully processed SMS: {text[:30]}...") + logger.info(f"Successfully processed SMS: {text[:30]}..., category: {category}, confidence: {confidence:.4f}") return jsonify({ "status": "success", @@ -958,7 +1496,7 @@ "details": details } }) - + save_sms_to_file except BadRequest as e: logger.warning(f"Invalid request: {str(e)}") return jsonify({ -- Gitblit v1.9.3