| | |
| | | # -*- coding: utf-8 -*- |
| | | import os |
| | | import logging |
| | | import datetime |
| | | from typing import Dict, Optional, Tuple |
| | | |
| | | from flask import Flask, request, jsonify |
| | |
| | | logger.error(f"加载火车票模型失败: {str(e)}") |
| | | raise |
| | | |
| | | def classify_sms(self, text: str) -> str: |
| | | """对短信进行分类""" |
| | | def classify_sms(self, text: str) -> Tuple[str, float]: |
| | | """对短信进行分类,并返回置信度""" |
| | | try: |
| | | inputs = self.classifier_tokenizer( |
| | | text, |
| | |
| | | ) |
| | | with torch.no_grad(): |
| | | outputs = self.classifier_model(**inputs) |
| | | pred_id = outputs.logits.argmax().item() |
| | | return self.classifier_model.config.id2label[pred_id] |
| | | |
| | | # 获取预测标签及其对应的概率 |
| | | logits = outputs.logits |
| | | probabilities = torch.softmax(logits, dim=1) |
| | | pred_id = logits.argmax().item() |
| | | confidence = probabilities[0, pred_id].item() # 获取预测标签的置信度 |
| | | |
| | | return self.classifier_model.config.id2label[pred_id], confidence |
| | | except Exception as e: |
| | | logger.error(f"短信分类失败: {str(e)}") |
| | | raise |
| | | |
def is_marketing_sms(self, text: str) -> bool:
    """Decide whether an SMS should be filtered as marketing/advertising.

    The decision is made in stages:

    1. A set of regular expressions that flag the text outright.
    2. Explicit exemptions (China Merchants Bank bill reminders) and an
       explicit Baidu account-notice rule.
    3. A weighted keyword score, reduced when the text carries business
       features (express delivery, repayment, income, travel) and
       increased for bare URLs and telecom balance notices.

    Args:
        text: Raw SMS content.

    Returns:
        True when the text is judged to be marketing/notice content,
        False otherwise.
    """

    def contains_any(keywords) -> bool:
        """Return True if any keyword occurs as a substring of ``text``."""
        return any(keyword in text for keyword in keywords)

    def count_hits(keywords) -> int:
        """Return how many keyword entries occur as substrings of ``text``."""
        return sum(1 for keyword in keywords if keyword in text)

    # Patterns that identify a marketing/notice SMS without scoring.
    marketing_patterns = (
        # Baidu account notices
        r"百度智能云.*?尊敬的用户",
        r"百度.*?账户.*?tokens",
        r"AppBuilder.*?账户",
        r"账户有.*?免费额度",
        r".*?免费额度.*?过期",
        r"dwz\.cn\/[A-Za-z0-9]+",
    )
    if any(re.search(pattern, text) for pattern in marketing_patterns):
        return True

    # Strong marketing keywords (weight 3).
    strong_marketing_keywords = (
        "有奖", "免费赠送", "抽奖", "中奖", "优惠券", "折扣券", "特价", "秒杀",
        "限时抢购", "促销", "推广", "广告", "代金券", "0元购", "tokens调用量",
    )
    # General marketing keywords (weight 2). "特价" and "推广" also appear
    # in the strong list, so they contribute to both weights.
    general_marketing_keywords = (
        "活动", "优惠", "折扣", "限时", "抢购", "特价", "promotion", "推广",
        "开业", "集点", "集赞", "关注", "公众号", "小程序", "注册有礼", "免费额度",
    )
    # Weak marketing keywords (weight 1); these also occur in normal SMS.
    weak_marketing_keywords = (
        "尊敬的用户", "尊敬的客户", "您好", "注册", "登录", "账户", "账号",
        "会员", "积分", "权益", "提醒", "即将", "有效期", "过期", "升级",
        "更新", "下载", "APP", "应用", "平台", "网址", "点击", "工单",
    )
    # Short links and URLs, scored only when no business feature is present.
    url_patterns = (
        "dwz.cn", "t.cn", "短网址", "http://", "https://", "cmbt.cn",
    )

    # Business features, used to lower the false-positive rate.
    express_keywords = (
        "快递", "包裹", "取件码", "取件", "签收", "派送", "配送", "物流",
        "驿站", "在途", "揽收", "暂存", "已到达", "丰巢", "柜取件", "柜机",
    )
    repayment_keywords = (
        "还款", "账单", "信用卡", "借款", "贷款", "逾期", "欠款", "最低还款",
        "应还金额", "到期还款", "还清", "应还", "还款日", "账单¥", "查账还款",
    )
    income_keywords = (
        "收入", "转账", "入账", "到账", "支付", "工资", "报销", "余额",
        "成功收款", "收到", "款项",
    )
    travel_keywords = (
        "航班", "航空", "飞机", "机票", "火车", "铁路", "列车", "车票",
        "出发", "抵达", "起飞", "登机", "候车", "检票",
    )
    # Notice-style keywords (messages that usually need no processing).
    notification_keywords = (
        "余额不足", "话费不足", "话费余额", "通讯费", "流量用尽", "流量不足",
        "停机", "恢复通话", "自动充值", "交费", "缴费",
        "消费提醒", "交易提醒", "动账", "短信通知", "验证码", "校验码", "安全码",
    )
    # Telecom operator markers.
    telecom_keywords = (
        "中国电信", "中国移动", "中国联通", "电信", "移动", "联通",
        "携号转网", "号码服务", "通讯服务", "189.cn", "10086", "10010",
    )

    # China Merchants Bank bill/repayment reminders must never be filtered.
    if ("招商银行" in text and ("账单" in text or "还款日" in text)) or "cmbt.cn" in text:
        if "还款" in text or "账单" in text or "消费卡" in text:
            return False

    # Baidu account notices are always filtered.
    if "百度" in text and "尊敬的用户" in text:
        return True

    has_express_feature = contains_any(express_keywords)
    has_repayment_feature = contains_any(repayment_keywords)
    has_income_feature = contains_any(income_keywords)
    has_travel_feature = contains_any(travel_keywords)

    # Two or more distinct business features: treat as a business SMS.
    business_score = (has_express_feature + has_repayment_feature +
                      has_income_feature + has_travel_feature)
    if business_score >= 2:
        return False

    score = (3 * count_hits(strong_marketing_keywords)
             + 2 * count_hits(general_marketing_keywords)
             + count_hits(weak_marketing_keywords))

    # Business features reduce the marketing score.
    if has_express_feature:
        score -= 3
    if has_repayment_feature:
        score -= 3
    if has_income_feature:
        score -= 2
    if has_travel_feature:
        score -= 2

    # Notice keywords count only for non-express, non-repayment messages,
    # and only when at least two of them are present.
    if not has_express_feature and not has_repayment_feature:
        notification_count = count_hits(notification_keywords)
        if notification_count >= 2:
            score += notification_count

    # A URL inside a business SMS is probably a tracking link; a URL with
    # no business feature at all is more likely promotional.
    if contains_any(url_patterns) and business_score == 0:
        score += 2

    # Telecom balance notice. "余额" is itself an income keyword, so the
    # income exclusion has to ignore it; otherwise this rule could never
    # trigger for any text that mentions a balance.
    has_other_income_feature = any(
        keyword in text for keyword in income_keywords if keyword != "余额"
    )
    if contains_any(telecom_keywords) and "余额" in text and not has_other_income_feature:
        score += 2

    # At least 4 points are required to classify the SMS as marketing.
    threshold = 4
    return score >= threshold
| | | |
def is_notification_sms(self, text: str) -> bool:
    """Decide whether an SMS is a filterable notification.

    Notifications here are bank transaction notices and telecom
    balance/data/suspension reminders. Any text containing a repayment,
    express-delivery or income keyword is treated as a business SMS and
    is never reported as a notification.

    Args:
        text: Raw SMS content.

    Returns:
        True if the text matches a bank-transaction or telecom-balance
        pattern and contains no business keyword, otherwise False.
    """
    # Bank transaction notices (repayment reminders are excluded below).
    # Alternatives are written as non-capturing groups: inside "[...]" the
    # "|" is a literal and the class matches a single character only.
    bank_transaction_patterns = (
        r"您尾号\d+的.+消费",
        r"您.+账户消费[\d,.]+元",
        r"交易[\d,.]+元",
        r"支付宝.+消费",
        r"微信支付.+消费",
        # Accept both the ASCII and the full-width colon in the time.
        r"\d{1,2}月\d{1,2}日\d{1,2}[::]\d{1,2}消费",
        r"银行卡(?:支付|消费|扣款)",
    )

    # Texts containing any of these are business SMS, not notifications.
    business_keywords = (
        # repayment
        "还款", "账单", "应还", "到期还款", "还款日", "最低还款", "账单¥", "查账还款",
        # express delivery
        "快递", "包裹", "取件码", "取件", "签收", "派送", "配送",
        # income
        "收入", "转账", "入账", "到账", "支付成功", "工资",
    )

    # Telecom balance / data / suspension reminders.
    telecom_balance_patterns = (
        r"余额(?:不足|低于)[\d,.]+元",
        r"话费(?:不足|仅剩)[\d,.]+元",
        r"流量(?:不足|即将用尽)",
        r"(?:电信|移动|联通).+余额",
        r"(?:停机|停号)提醒",
        r"为了保障您的正常通讯",
    )

    # Business keywords take precedence over every notification pattern.
    if any(keyword in text for keyword in business_keywords):
        return False

    if any(re.search(pattern, text) for pattern in bank_transaction_patterns):
        logger.debug(f"识别到银行交易通知短信:{text[:30]}...")
        return True

    if any(re.search(pattern, text) for pattern in telecom_balance_patterns):
        logger.debug(f"识别到运营商余额通知短信:{text[:30]}...")
        return True

    return False
| | | |
| | | def extract_entities(self, text: str) -> Dict[str, Optional[str]]: |
| | | """提取文本中的实体""" |
| | |
| | | result["date"] = date |
| | | |
| | | # 处理金额 |
| | | # 先尝试使用正则表达式直接匹配金额 |
| | | # 尝试匹配带¥符号的账单金额模式 |
| | | amount_match = re.search(r'账单¥([\d,]+\.?\d*)', text) |
| | | if not amount_match: |
| | | # 尝试匹配带¥符号的账单金额模式 |
| | | amount_match = re.search(r'账单¥([\d,]+\.?\d*)', text) |
| | | if not amount_match: |
| | | # 尝试匹配一般金额模式 |
| | | amount_match = re.search(r'(?:应还|还款)?金额([\d,]+\.?\d*)(?:元|块钱|块|万元|万)?', text) |
| | | |
| | | if amount_match: |
| | | amount = amount_match.group(1) # 保留原始格式(带逗号) |
| | | # 验证金额有效性 |
| | |
| | | |
| | | # 如果还是没有找到,尝试从文本中提取 |
| | | if not amount_candidates: |
| | | # 使用更宽松的正则表达式匹配金额 |
| | | amount_pattern = re.compile(r'([\d,]+\.?\d*)(?:元|块钱|块|万元|万)') |
| | | matches = list(amount_pattern.finditer(text)) |
| | | # 使用多个正则表达式匹配不同格式的金额 |
| | | # 1. 匹配带¥符号格式 |
| | | matches = list(re.finditer(r'¥([\d,]+\.?\d*)', text)) |
| | | # 2. 匹配带¥符号格式 |
| | | matches.extend(list(re.finditer(r'¥([\d,]+\.?\d*)', text))) |
| | | # 3. 匹配一般金额格式 |
| | | matches.extend(list(re.finditer(r'([\d,]+\.?\d*)(?:元|块钱|块|万元|万)', text))) |
| | | |
| | | for match in matches: |
| | | amount_text = match.group(1) # 获取数字部分,保留逗号 |
| | |
| | | result["datetime"] = datetime |
| | | |
| | | # 处理收入金额 |
| | | # 先尝试使用正则表达式直接匹配收入金额,包括"收入金额"格式 |
| | | amount_match = re.search(r'收入金额([\d,]+\.?\d*)元', text) |
| | | if not amount_match: |
| | | # 尝试匹配一般收入格式 |
| | | amount_match = re.search(r'收入([\d,]+\.?\d*)元', text) |
| | | |
| | | if amount_match: |
| | | amount = amount_match.group(1) # 保留原始格式(带逗号) |
| | | # 验证金额有效性 |
| | | try: |
| | | value = float(amount.replace(',', '')) |
| | | if value > 0: |
| | | result["amount"] = amount |
| | | except ValueError: |
| | | pass |
| | | |
| | | # 如果正则没有匹配到,继续尝试NER结果 |
| | | if not result["amount"]: |
| | | amount_candidates = [] |
| | | # 首先从识别的实体中获取 |
| | | for amount in entities["PICKUP_CODE"]: |
| | |
| | | |
| | | # 如果没有找到有效金额,直接从文本中尝试提取 |
| | | if not amount_candidates: |
| | | # 直接在整个文本中寻找金额模式 |
| | | amount_pattern = re.compile(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)') |
| | | matches = list(amount_pattern.finditer(text)) |
| | | # 尝试多种模式匹配金额 |
| | | # 1. 匹配"收入金额xxx元"模式 |
| | | matches = list(re.finditer(r'收入金额([\d,]+\.?\d*)元', text)) |
| | | # 2. 匹配"收入xxx元"模式 |
| | | matches.extend(list(re.finditer(r'收入([\d,]+\.?\d*)元', text))) |
| | | # 3. 匹配带元结尾的金额 |
| | | matches.extend(list(re.finditer(r'([0-9,]+\.[0-9]+)元', text))) |
| | | # 4. 匹配普通数字(可能是余额),但排除已识别为余额的金额 |
| | | if "余额" in text: |
| | | balance_match = re.search(r'余额([\d,]+\.?\d*)元', text) |
| | | if balance_match: |
| | | balance_value = balance_match.group(1) |
| | | # 只匹配不等于余额的金额 |
| | | all_numbers = re.finditer(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)', text) |
| | | for match in all_numbers: |
| | | if match.group(1) != balance_value: |
| | | matches.append(match) |
| | | else: |
| | | matches.extend(list(re.finditer(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)', text))) |
| | | |
| | | for match in matches: |
| | | amount_text = match.group(1) |
| | |
| | | except ValueError: |
| | | continue |
| | | |
| | | # 选择最合适的有效金额 |
| | | # 从金额候选中排除已识别的余额值 |
| | | if result["balance"]: |
| | | try: |
| | | balance_value = float(result["balance"].replace(',', '')) |
| | | amount_candidates = [(text, value) for text, value in amount_candidates if abs(value - balance_value) > 0.01] |
| | | except ValueError: |
| | | pass |
| | | |
| | | # 选择适当的金额作为收入 |
| | | if amount_candidates: |
| | | has_income_amount_keyword = "收入金额" in text |
| | | |
| | | if has_income_amount_keyword: |
| | | # 查找"收入金额"附近的数字 |
| | | idx = text.find("收入金额") |
| | | if idx != -1: |
| | | closest_amount = None |
| | | min_distance = float('inf') |
| | | for amount_text, value in amount_candidates: |
| | | # 找到这个数字在原文中的位置 |
| | | amount_idx = text.find(amount_text) |
| | | if amount_idx != -1: |
| | | distance = abs(amount_idx - idx) |
| | | if distance < min_distance: |
| | | min_distance = distance |
| | | closest_amount = amount_text |
| | | |
| | | if closest_amount: |
| | | result["amount"] = closest_amount |
| | | else: |
| | | # 如果无法找到最近的金额,使用最大金额策略 |
| | | result["amount"] = max(amount_candidates, key=lambda x: x[1])[0] |
| | | else: |
| | | # 如果没有"收入金额"关键词,则使用最大金额策略 |
| | | result["amount"] = max(amount_candidates, key=lambda x: x[1])[0] |
| | | |
| | | # 处理余额 |
| | | if entities["BALANCE"]: |
| | | # 先尝试使用正则表达式直接匹配余额 |
| | | balance_match = re.search(r'余额([\d,]+\.?\d*)元', text) |
| | | if balance_match: |
| | | balance = balance_match.group(1) # 保留原始格式(带逗号) |
| | | # 验证金额有效性 |
| | | try: |
| | | value = float(balance.replace(',', '')) |
| | | if value > 0: |
| | | result["balance"] = balance |
| | | except ValueError: |
| | | pass |
| | | |
| | | # 如果正则没有匹配到,使用NER结果 |
| | | if not result["balance"] and entities["BALANCE"]: |
| | | for amount in entities["BALANCE"]: |
| | | cleaned_amount = clean_amount(amount, text) |
| | | if cleaned_amount: |
| | |
| | | app = Flask(__name__) |
| | | model_manager = ModelManager() |
| | | |
| | | # 添加保存短信到文件的函数 |
def save_sms_to_file(text: str, category: Optional[str] = None, confidence: Optional[float] = None) -> bool:
    """Append an SMS and its classification result to a per-day log file.

    Entries go to ``./sms_logs/sms_log_<YYYY-MM-DD>.txt``; the directory
    is created on demand. Saving is best-effort: any failure is logged
    and reported through the return value instead of being raised.

    Args:
        text: SMS content.
        category: Classification label, or None if not classified yet.
        confidence: Classification confidence, or None if not classified
            yet. A confidence of 0.0 is a valid value and is recorded.

    Returns:
        True if the entry was written, False otherwise.
    """
    try:
        log_dir = "./sms_logs"
        # exist_ok avoids the check-then-create race between requests.
        os.makedirs(log_dir, exist_ok=True)

        # Sample the clock once so the file name and the entry timestamp
        # cannot disagree around midnight.
        now = datetime.datetime.now()
        today = now.strftime("%Y-%m-%d")
        current_time = now.strftime("%Y-%m-%d %H:%M:%S")
        file_path = os.path.join(log_dir, f"sms_log_{today}.txt")

        # Test confidence against None, not for truthiness, so that a
        # confidence of exactly 0.0 is still logged as classified.
        if category and confidence is not None:
            category_info = f"分类: {category}, 置信度: {confidence:.4f}"
        else:
            category_info = "未分类"
        log_content = f"[{current_time}] {category_info}\n{text}\n{'='*50}\n"

        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(log_content)

        return True
    except Exception as e:
        # Logging an SMS must never break request handling.
        logger.error(f"保存短信到文件失败: {str(e)}")
        return False
| | | |
| | | @app.route("/health", methods=["GET"]) |
| | | def health_check(): |
| | | """健康检查接口""" |
| | |
| | | if not isinstance(text, str) or not text.strip(): |
| | | raise BadRequest("短信内容不能为空") |
| | | |
| | | # 处理短信 |
| | | category = model_manager.classify_sms(text) |
| | | if category == "快递": |
| | | details = model_manager.extract_entities(text) |
| | | elif category == "还款": |
| | | # 保存原始短信内容到文件 |
| | | save_sms_to_file(text) |
| | | |
| | | # 特定短信识别逻辑 - 针对百度通知和招商银行账单 |
| | | # 识别百度通知 |
| | | if "百度智能云" in text and "尊敬的用户" in text and "免费额度" in text: |
| | | logger.info(f"直接识别为百度通知短信: {text[:30]}...") |
| | | category = "其他" |
| | | save_sms_to_file(text, category, 1.0) # 记录分类结果 |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": {} |
| | | } |
| | | }) |
| | | |
| | | # 识别招商银行账单 |
| | | if "招商银行" in text and ("账单¥" in text or "账单¥" in text or "还款日" in text): |
| | | logger.info(f"直接识别为招商银行还款短信: {text[:30]}...") |
| | | category = "还款" |
| | | details = model_manager.extract_repayment_entities(text) |
| | | elif category == "收入": |
| | | details = model_manager.extract_income_entities(text) |
| | | elif category == "航班": |
| | | details = model_manager.extract_flight_entities(text) |
| | | elif category == "火车票": # 添加火车票类别处理 |
| | | details = model_manager.extract_train_entities(text) |
| | | else: |
| | | details = {} |
| | | |
| | | # 记录处理结果 |
| | | logger.info(f"Successfully processed SMS: {text[:30]}...") |
| | | |
| | | save_sms_to_file(text, category, 1.0) # 记录分类结果 |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | |
| | | } |
| | | }) |
| | | |
| | | # 处理短信 |
| | | category, confidence = model_manager.classify_sms(text) |
| | | |
| | | # 保存短信内容和分类结果 |
| | | save_sms_to_file(text, category, confidence) |
| | | |
| | | # 如果是明确的业务短信类别,直接进入处理流程 |
| | | if category in ["快递", "还款", "收入", "航班", "火车票"] and confidence > 0.5: |
| | | # 对百度通知的特殊处理 |
| | | if category == "快递" and "百度" in text and "尊敬的用户" in text: |
| | | logger.info(f"纠正百度通知短信的分类: {text[:30]}...") |
| | | category = "其他" |
| | | save_sms_to_file(text, category, confidence) # 更新分类结果 |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": {} |
| | | } |
| | | }) |
| | | |
| | | # 对于高置信度的业务分类,直接进入实体提取 |
| | | if category == "快递": |
| | | details = model_manager.extract_entities(text) |
| | | elif category == "还款": |
| | | details = model_manager.extract_repayment_entities(text) |
| | | elif category == "收入": |
| | | details = model_manager.extract_income_entities(text) |
| | | elif category == "航班": |
| | | details = model_manager.extract_flight_entities(text) |
| | | elif category == "火车票": |
| | | details = model_manager.extract_train_entities(text) |
| | | |
| | | logger.info(f"高置信度业务短信: {text[:30]}..., category: {category}, confidence: {confidence:.4f}") |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": details |
| | | } |
| | | }) |
| | | |
| | | # 检查是否为营销/广告短信 |
| | | if model_manager.is_marketing_sms(text): |
| | | # 如果是营销/广告短信,直接归类为"其他" |
| | | logger.info(f"检测到营销/广告短信: {text[:30]}...") |
| | | category = "其他" |
| | | save_sms_to_file(text, category, confidence) # 更新分类结果 |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": {} |
| | | } |
| | | }) |
| | | |
| | | # 检查是否为通知类短信 |
| | | if model_manager.is_notification_sms(text): |
| | | # 如果是通知类短信,直接归类为"其他" |
| | | logger.info(f"检测到通知类短信: {text[:30]}...") |
| | | category = "其他" |
| | | save_sms_to_file(text, category, confidence) # 更新分类结果 |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": {} |
| | | } |
| | | }) |
| | | |
| | | # 置信度阈值,低于此阈值的分类结果被视为"其他" |
| | | confidence_threshold = 0.7 |
| | | if confidence < confidence_threshold: |
| | | logger.info(f"短信分类置信度低({confidence:.4f}),归类为'其他': {text[:30]}...") |
| | | category = "其他" |
| | | save_sms_to_file(text, category, confidence) # 更新分类结果 |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": {} |
| | | } |
| | | }) |
| | | |
| | | # 根据分类结果调用对应的实体提取函数 |
| | | if category == "快递": |
| | | details = model_manager.extract_entities(text) |
| | | elif category == "还款": |
| | | details = model_manager.extract_repayment_entities(text) |
| | | elif category == "收入": |
| | | details = model_manager.extract_income_entities(text) |
| | | elif category == "航班": |
| | | details = model_manager.extract_flight_entities(text) |
| | | elif category == "火车票": |
| | | details = model_manager.extract_train_entities(text) |
| | | else: |
| | | details = {} |
| | | |
| | | # 记录处理结果 |
| | | logger.info(f"Successfully processed SMS: {text[:30]}..., category: {category}, confidence: {confidence:.4f}") |
| | | |
| | | return jsonify({ |
| | | "status": "success", |
| | | "data": { |
| | | "category": category, |
| | | "details": details |
| | | } |
| | | }) |
| | | save_sms_to_file |
| | | except BadRequest as e: |
| | | logger.warning(f"Invalid request: {str(e)}") |
| | | return jsonify({ |