model_classifier.git

			@@ -1,6 +1,7 @@
			# -- coding: utf-8 --
			import os
			import logging
			import datetime
			from typing import Dict, Optional, Tuple

			from flask import Flask, request, jsonify
			@@ -124,8 +125,8 @@
			logger.error(f"加载火车票模型失败: {str(e)}")
			raise

			def classify_sms(self, text: str) -> str:
			"""对短信进行分类"""
			def classify_sms(self, text: str) -> Tuple[str, float]:
			"""对短信进行分类，并返回置信度"""
			try:
			inputs = self.classifier_tokenizer(
			text,
			@@ -135,11 +136,243 @@
			)
			with torch.no_grad():
			outputs = self.classifier_model(**inputs)
			pred_id = outputs.logits.argmax().item()
			return self.classifier_model.config.id2label[pred_id]

			# 获取预测标签及其对应的概率
			logits = outputs.logits
			probabilities = torch.softmax(logits, dim=1)
			pred_id = logits.argmax().item()
			confidence = probabilities[0, pred_id].item() # 获取预测标签的置信度

			return self.classifier_model.config.id2label[pred_id], confidence
			except Exception as e:
			logger.error(f"短信分类失败: {str(e)}")
			raise

			def is_marketing_sms(self, text: str) -> bool:
			    """Return True when ``text`` looks like a marketing/advertising SMS.

			    Checks run in this order and the first decisive one wins:

			    1. Known promotional notice patterns (regex)      -> True
			    2. China Merchants Bank bill / repayment messages -> False
			    3. Baidu "尊敬的用户" notices                      -> True
			    4. Two or more business categories present
			       (express, repayment, income, travel)           -> False
			    5. Otherwise a keyword score is accumulated and compared
			       against a threshold of 4.

			    Args:
			        text: Raw SMS body.

			    Returns:
			        True if the message should be filtered as marketing.
			    """

			    def contains_any(keywords) -> bool:
			        # True when at least one keyword occurs in the message.
			        return any(keyword in text for keyword in keywords)

			    def hit_count(keywords) -> int:
			        # Number of distinct keywords that occur in the message.
			        return sum(1 for keyword in keywords if keyword in text)

			    # --- 1. Patterns that identify a marketing notice outright ----------
			    # The gaps use ``.*?`` throughout: the previous ``.?`` allowed at most
			    # one character between the anchors, so "免费额度将于明天过期" or
			    # "百度...账户...tokens" never matched.
			    marketing_patterns = (
			        r"百度智能云.*?尊敬的用户",
			        r"百度.*?账户.*?tokens",
			        r"AppBuilder.*?账户",
			        r"账户有.*?免费额度",
			        r"免费额度.*?过期",
			        r"dwz\.cn\/[A-Za-z0-9]+",
			    )
			    if any(re.search(pattern, text) for pattern in marketing_patterns):
			        return True

			    # --- Keyword tables ---------------------------------------------------
			    # Strong marketing signals, 3 points each.
			    strong_marketing_keywords = (
			        "有奖", "免费赠送", "抽奖", "中奖", "优惠券", "折扣券", "特价", "秒杀",
			        "限时抢购", "促销", "推广", "广告", "代金券", "0元购", "tokens调用量",
			    )
			    # General marketing signals, 2 points each.
			    # NOTE(review): "特价" and "推广" are also in the strong list, so they
			    # score 3 + 2 = 5. Kept as-is because removing them changes verdicts.
			    general_marketing_keywords = (
			        "活动", "优惠", "折扣", "限时", "抢购", "特价", "promotion", "推广",
			        "开业", "集点", "集赞", "关注", "公众号", "小程序", "注册有礼", "免费额度",
			    )
			    # Weak signals that also occur in ordinary messages, 1 point each.
			    weak_marketing_keywords = (
			        "尊敬的用户", "尊敬的客户", "您好", "注册", "登录", "账户", "账号",
			        "会员", "积分", "权益", "提醒", "即将", "有效期", "过期", "升级",
			        "更新", "下载", "APP", "应用", "平台", "网址", "点击", "工单",
			    )
			    # Short links / URLs; only scored when no business feature is present.
			    url_patterns = (
			        "dwz.cn", "t.cn", "短网址", "http://", "https://", "cmbt.cn",
			    )
			    # Business categories, used to lower the false-positive rate.
			    express_keywords = (
			        "快递", "包裹", "取件码", "取件", "签收", "派送", "配送", "物流",
			        "驿站", "在途", "揽收", "暂存", "已到达", "丰巢", "柜取件", "柜机",
			    )
			    repayment_keywords = (
			        "还款", "账单", "信用卡", "借款", "贷款", "逾期", "欠款", "最低还款",
			        "应还金额", "到期还款", "还清", "应还", "还款日", "账单￥", "账单¥", "查账还款",
			    )
			    income_keywords = (
			        "收入", "转账", "入账", "到账", "支付", "工资", "报销", "余额",
			        "成功收款", "收到", "款项",
			    )
			    travel_keywords = (
			        "航班", "航空", "飞机", "机票", "火车", "铁路", "列车", "车票",
			        "出发", "抵达", "起飞", "登机", "候车", "检票",
			    )
			    # Notification-style phrases (usually messages that need no handling).
			    notification_keywords = (
			        "余额不足", "话费不足", "话费余额", "通讯费", "流量用尽", "流量不足",
			        "停机", "恢复通话", "自动充值", "交费", "缴费",
			        "消费提醒", "交易提醒", "动账", "短信通知", "验证码", "校验码", "安全码",
			    )
			    # Carrier identifiers.
			    telecom_keywords = (
			        "中国电信", "中国移动", "中国联通", "电信", "移动", "联通",
			        "携号转网", "号码服务", "通讯服务", "189.cn", "10086", "10010",
			    )

			    # --- 2. China Merchants Bank bills must never be filtered -----------
			    if ("招商银行" in text and ("账单" in text or "还款日" in text)) or "cmbt.cn" in text:
			        if "还款" in text or "账单" in text or "消费卡" in text:
			            return False

			    # --- 3. Baidu notices are always filtered ------------------------------
			    if "百度" in text and "尊敬的用户" in text:
			        return True

			    # --- 4. Several business categories at once => genuine business SMS ---
			    has_express_feature = contains_any(express_keywords)
			    has_repayment_feature = contains_any(repayment_keywords)
			    has_income_feature = contains_any(income_keywords)
			    has_travel_feature = contains_any(travel_keywords)
			    business_score = sum(
			        (has_express_feature, has_repayment_feature,
			         has_income_feature, has_travel_feature)
			    )
			    if business_score >= 2:
			        return False

			    # --- 5. Scoring ----------------------------------------------------------
			    score = (
			        3 * hit_count(strong_marketing_keywords)
			        + 2 * hit_count(general_marketing_keywords)
			        + hit_count(weak_marketing_keywords)
			    )

			    # Business features pull the score down.
			    if has_express_feature:
			        score -= 3
			    if has_repayment_feature:
			        score -= 3
			    if has_income_feature:
			        score -= 2
			    if has_travel_feature:
			        score -= 2

			    # Notification phrases count only when at least two are present and
			    # the message is neither an express nor a repayment SMS.
			    if not has_express_feature and not has_repayment_feature:
			        notification_count = hit_count(notification_keywords)
			        if notification_count >= 2:
			            score += notification_count

			    # A URL in a business SMS is assumed to be a tracking link; otherwise
			    # it counts towards marketing.
			    if contains_any(url_patterns) and business_score == 0:
			        score += 2

			    # Carrier balance notice. "余额" is itself an income keyword, so the
			    # former ``not has_income_feature`` guard could never be true here and
			    # the bonus was unreachable. The guard now ignores "余额" and only
			    # excludes messages carrying some other income signal.
			    if "余额" in text and contains_any(telecom_keywords):
			        has_other_income_signal = any(
			            keyword in text
			            for keyword in income_keywords
			            if keyword != "余额"
			        )
			        if not has_other_income_signal:
			            score += 2

			    # At least 4 points are required to call the message marketing.
			    threshold = 4
			    return score >= threshold

			def is_notification_sms(self, text: str) -> bool:
			    """Return True for notification-only SMS (bank transaction or carrier notices).

			    Messages containing any repayment, express or income keyword are
			    never treated as notifications, so they are not filtered out.

			    The regexes use non-capturing alternation groups ``(?:a|b)``. The
			    previous character classes (``[不足|低于]``) matched one character
			    only, so "余额不足10元" could never match while unrelated text such as
			    "活动…余额" or "手机提醒" did.

			    Args:
			        text: Raw SMS body.

			    Returns:
			        True if the message matches a bank-transaction or carrier-balance
			        notification pattern and contains no business keyword.
			    """
			    # Exclusion rule: any of these keywords marks a business SMS.
			    business_keywords = (
			        # Repayment
			        "还款", "账单", "应还", "到期还款", "还款日", "最低还款", "账单￥", "账单¥", "查账还款",
			        # Express delivery
			        "快递", "包裹", "取件码", "取件", "签收", "派送", "配送",
			        # Income
			        "收入", "转账", "入账", "到账", "支付成功", "工资",
			    )
			    if any(keyword in text for keyword in business_keywords):
			        return False

			    # Bank transaction notices (repayment reminders are excluded above).
			    bank_transaction_patterns = (
			        r"您尾号\d+的.+消费",
			        r"您.+账户消费[\d,.]+元",
			        r"交易[\d,.]+元",
			        r"支付宝.+消费",
			        r"微信支付.+消费",
			        r"\d{1,2}月\d{1,2}日\d{1,2}[:：]\d{1,2}消费",
			        r"银行卡(?:支付|消费|扣款)",
			    )
			    # Carrier balance / service notices.
			    telecom_balance_patterns = (
			        r"余额(?:不足|低于)[\d,.]+元",
			        r"话费(?:不足|仅剩)[\d,.]+元",
			        r"流量(?:不足|即将用尽)",
			        r"(?:电信|移动|联通).+余额",
			        r"(?:停机|停号)提醒",
			        r"为了保障您的正常通讯",
			    )

			    if any(re.search(pattern, text) for pattern in bank_transaction_patterns):
			        logger.debug(f"识别到银行交易通知短信：{text[:30]}...")
			        return True

			    if any(re.search(pattern, text) for pattern in telecom_balance_patterns):
			        logger.debug(f"识别到运营商余额通知短信：{text[:30]}...")
			        return True

			    return False

			def extract_entities(self, text: str) -> Dict[str, Optional[str]]:
			"""提取文本中的实体"""
			@@ -504,8 +737,15 @@
			result["date"] = date

			# 处理金额
			# 先尝试使用正则表达式直接匹配金额
			# 尝试匹配带￥符号的账单金额模式
			amount_match = re.search(r'账单￥([\d,]+\.?\d*)', text)
			if not amount_match:
			# 尝试匹配带¥符号的账单金额模式
			amount_match = re.search(r'账单¥([\d,]+\.?\d*)', text)
			if not amount_match:
			# 尝试匹配一般金额模式
			amount_match = re.search(r'(?:应还\|还款)?金额([\d,]+\.?\d*)(?:元\|块钱\|块\|万元\|万)?', text)

			if amount_match:
			amount = amount_match.group(1) # 保留原始格式(带逗号)
			# 验证金额有效性
			@@ -531,9 +771,13 @@

			# 如果还是没有找到，尝试从文本中提取
			if not amount_candidates:
			# 使用更宽松的正则表达式匹配金额
			amount_pattern = re.compile(r'([\d,]+\.?\d*)(?:元\|块钱\|块\|万元\|万)')
			matches = list(amount_pattern.finditer(text))
			# 使用多个正则表达式匹配不同格式的金额
			# 1. 匹配带￥符号格式
			matches = list(re.finditer(r'￥([\d,]+\.?\d*)', text))
			# 2. 匹配带¥符号格式
			matches.extend(list(re.finditer(r'¥([\d,]+\.?\d*)', text)))
			# 3. 匹配一般金额格式
			matches.extend(list(re.finditer(r'([\d,]+\.?\d*)(?:元\|块钱\|块\|万元\|万)', text)))

			for match in matches:
			amount_text = match.group(1) # 获取数字部分，保留逗号
			@@ -711,6 +955,24 @@
			result["datetime"] = datetime

			# 处理收入金额
			# 先尝试使用正则表达式直接匹配收入金额，包括"收入金额"格式
			amount_match = re.search(r'收入金额([\d,]+\.?\d*)元', text)
			if not amount_match:
			# 尝试匹配一般收入格式
			amount_match = re.search(r'收入([\d,]+\.?\d*)元', text)

			if amount_match:
			amount = amount_match.group(1) # 保留原始格式(带逗号)
			# 验证金额有效性
			try:
			value = float(amount.replace(',', ''))
			if value > 0:
			result["amount"] = amount
			except ValueError:
			pass

			# 如果正则没有匹配到，继续尝试NER结果
			if not result["amount"]:
			amount_candidates = []
			# 首先从识别的实体中获取
			for amount in entities["PICKUP_CODE"]:
			@@ -724,9 +986,25 @@

			# 如果没有找到有效金额，直接从文本中尝试提取
			if not amount_candidates:
			# 直接在整个文本中寻找金额模式
			amount_pattern = re.compile(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?\|\d+(?:\.\d{1,2})?)')
			matches = list(amount_pattern.finditer(text))
			# 尝试多种模式匹配金额
			# 1. 匹配"收入金额xxx元"模式
			matches = list(re.finditer(r'收入金额([\d,]+\.?\d*)元', text))
			# 2. 匹配"收入xxx元"模式
			matches.extend(list(re.finditer(r'收入([\d,]+\.?\d*)元', text)))
			# 3. 匹配带元结尾的金额
			matches.extend(list(re.finditer(r'([0-9,]+\.[0-9]+)元', text)))
			# 4. 匹配普通数字(可能是余额)，但排除已识别为余额的金额
			if "余额" in text:
			balance_match = re.search(r'余额([\d,]+\.?\d*)元', text)
			if balance_match:
			balance_value = balance_match.group(1)
			# 只匹配不等于余额的金额
			all_numbers = re.finditer(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?\|\d+(?:\.\d{1,2})?)', text)
			for match in all_numbers:
			if match.group(1) != balance_value:
			matches.append(match)
			else:
			matches.extend(list(re.finditer(r'(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?\|\d+(?:\.\d{1,2})?)', text)))

			for match in matches:
			amount_text = match.group(1)
			@@ -736,12 +1014,57 @@
			except ValueError:
			continue

			# 选择最合适的有效金额
			# 从金额候选中排除已识别的余额值
			if result["balance"]:
			try:
			balance_value = float(result["balance"].replace(',', ''))
			amount_candidates = [(text, value) for text, value in amount_candidates if abs(value - balance_value) > 0.01]
			except ValueError:
			pass

			# 选择适当的金额作为收入
			if amount_candidates:
			has_income_amount_keyword = "收入金额" in text

			if has_income_amount_keyword:
			# 查找"收入金额"附近的数字
			idx = text.find("收入金额")
			if idx != -1:
			closest_amount = None
			min_distance = float('inf')
			for amount_text, value in amount_candidates:
			# 找到这个数字在原文中的位置
			amount_idx = text.find(amount_text)
			if amount_idx != -1:
			distance = abs(amount_idx - idx)
			if distance < min_distance:
			min_distance = distance
			closest_amount = amount_text

			if closest_amount:
			result["amount"] = closest_amount
			else:
			# 如果无法找到最近的金额，使用最大金额策略
			result["amount"] = max(amount_candidates, key=lambda x: x[1])[0]
			else:
			# 如果没有"收入金额"关键词，则使用最大金额策略
			result["amount"] = max(amount_candidates, key=lambda x: x[1])[0]

			# 处理余额
			if entities["BALANCE"]:
			# 先尝试使用正则表达式直接匹配余额
			balance_match = re.search(r'余额([\d,]+\.?\d*)元', text)
			if balance_match:
			balance = balance_match.group(1) # 保留原始格式(带逗号)
			# 验证金额有效性
			try:
			value = float(balance.replace(',', ''))
			if value > 0:
			result["balance"] = balance
			except ValueError:
			pass

			# 如果正则没有匹配到，使用NER结果
			if not result["balance"] and entities["BALANCE"]:
			for amount in entities["BALANCE"]:
			cleaned_amount = clean_amount(amount, text)
			if cleaned_amount:
			@@ -974,6 +1297,45 @@
			# Flask application object; the route handlers below are registered on it.
			app = Flask(__name__)
			# Module-level ModelManager instance used by the request handlers
			# for classification and entity extraction.
			model_manager = ModelManager()

			# 添加保存短信到文件的函数
			def save_sms_to_file(text: str, category: Optional[str] = None, confidence: Optional[float] = None) -> bool:
			    """Append an SMS, and its classification when given, to the daily log file.

			    Entries go to ``./sms_logs/sms_log_<YYYY-MM-DD>.txt``. The function is
			    best-effort: any failure is logged and reported through the return
			    value instead of being raised.

			    Args:
			        text: SMS body to record.
			        category: Classification label, or None if not classified yet.
			        confidence: Classification confidence, or None if not classified.

			    Returns:
			        True when the entry was written, False on any error.
			    """
			    try:
			        log_dir = "./sms_logs"
			        # exist_ok avoids the check-then-create race of exists() + makedirs().
			        os.makedirs(log_dir, exist_ok=True)

			        # Take one timestamp so the file name and the entry header cannot
			        # disagree when the call straddles midnight.
			        now = datetime.datetime.now()
			        file_path = os.path.join(log_dir, f"sms_log_{now.strftime('%Y-%m-%d')}.txt")
			        current_time = now.strftime("%Y-%m-%d %H:%M:%S")

			        # Compare confidence with None rather than by truthiness: a
			        # confidence of 0.0 is a real result and must not log as "未分类".
			        if category and confidence is not None:
			            category_info = f"分类: {category}, 置信度: {confidence:.4f}"
			        else:
			            category_info = "未分类"
			        log_content = f"[{current_time}] {category_info}\n{text}\n{'='*50}\n"

			        with open(file_path, 'a', encoding='utf-8') as f:
			            f.write(log_content)

			        return True
			    except Exception as e:
			        # Deliberately broad: logging an SMS must never break the request.
			        logger.error(f"保存短信到文件失败: {str(e)}")
			        return False

			@app.route("/health", methods=["GET"])
			def health_check():
			"""健康检查接口"""
			@@ -995,24 +1357,29 @@
			if not isinstance(text, str) or not text.strip():
			raise BadRequest("短信内容不能为空")

			# 处理短信
			category = model_manager.classify_sms(text)
			if category == "快递":
			details = model_manager.extract_entities(text)
			elif category == "还款":
			# 保存原始短信内容到文件
			save_sms_to_file(text)

			# 特定短信识别逻辑 - 针对百度通知和招商银行账单
			# 识别百度通知
			if "百度智能云" in text and "尊敬的用户" in text and "免费额度" in text:
			logger.info(f"直接识别为百度通知短信: {text[:30]}...")
			category = "其他"
			save_sms_to_file(text, category, 1.0) # 记录分类结果
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": {}
			}
			})

			# 识别招商银行账单
			if "招商银行" in text and ("账单￥" in text or "账单¥" in text or "还款日" in text):
			logger.info(f"直接识别为招商银行还款短信: {text[:30]}...")
			category = "还款"
			details = model_manager.extract_repayment_entities(text)
			elif category == "收入":
			details = model_manager.extract_income_entities(text)
			elif category == "航班":
			details = model_manager.extract_flight_entities(text)
			elif category == "火车票": # 添加火车票类别处理
			details = model_manager.extract_train_entities(text)
			else:
			details = {}

			# 记录处理结果
			logger.info(f"Successfully processed SMS: {text[:30]}...")

			save_sms_to_file(text, category, 1.0) # 记录分类结果
			return jsonify({
			"status": "success",
			"data": {
			@@ -1021,6 +1388,115 @@
			}
			})

			# 处理短信
			category, confidence = model_manager.classify_sms(text)

			# 保存短信内容和分类结果
			save_sms_to_file(text, category, confidence)

			# 如果是明确的业务短信类别，直接进入处理流程
			if category in ["快递", "还款", "收入", "航班", "火车票"] and confidence > 0.5:
			# 对百度通知的特殊处理
			if category == "快递" and "百度" in text and "尊敬的用户" in text:
			logger.info(f"纠正百度通知短信的分类: {text[:30]}...")
			category = "其他"
			save_sms_to_file(text, category, confidence) # 更新分类结果
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": {}
			}
			})

			# 对于高置信度的业务分类，直接进入实体提取
			if category == "快递":
			details = model_manager.extract_entities(text)
			elif category == "还款":
			details = model_manager.extract_repayment_entities(text)
			elif category == "收入":
			details = model_manager.extract_income_entities(text)
			elif category == "航班":
			details = model_manager.extract_flight_entities(text)
			elif category == "火车票":
			details = model_manager.extract_train_entities(text)

			logger.info(f"高置信度业务短信: {text[:30]}..., category: {category}, confidence: {confidence:.4f}")
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": details
			}
			})

			# 检查是否为营销/广告短信
			if model_manager.is_marketing_sms(text):
			# 如果是营销/广告短信，直接归类为"其他"
			logger.info(f"检测到营销/广告短信: {text[:30]}...")
			category = "其他"
			save_sms_to_file(text, category, confidence) # 更新分类结果
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": {}
			}
			})

			# 检查是否为通知类短信
			if model_manager.is_notification_sms(text):
			# 如果是通知类短信，直接归类为"其他"
			logger.info(f"检测到通知类短信: {text[:30]}...")
			category = "其他"
			save_sms_to_file(text, category, confidence) # 更新分类结果
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": {}
			}
			})

			# 置信度阈值，低于此阈值的分类结果被视为"其他"
			confidence_threshold = 0.7
			if confidence < confidence_threshold:
			logger.info(f"短信分类置信度低({confidence:.4f})，归类为'其他': {text[:30]}...")
			category = "其他"
			save_sms_to_file(text, category, confidence) # 更新分类结果
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": {}
			}
			})

			# 根据分类结果调用对应的实体提取函数
			if category == "快递":
			details = model_manager.extract_entities(text)
			elif category == "还款":
			details = model_manager.extract_repayment_entities(text)
			elif category == "收入":
			details = model_manager.extract_income_entities(text)
			elif category == "航班":
			details = model_manager.extract_flight_entities(text)
			elif category == "火车票":
			details = model_manager.extract_train_entities(text)
			else:
			details = {}

			# 记录处理结果
			logger.info(f"Successfully processed SMS: {text[:30]}..., category: {category}, confidence: {confidence:.4f}")

			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": details
			}
			})
			save_sms_to_file
			except BadRequest as e:
			logger.warning(f"Invalid request: {str(e)}")
			return jsonify({