model_classifier.git

			@@ -10,6 +10,11 @@
			from werkzeug.exceptions import BadRequest
			from ner_config import NERConfig, RepaymentNERConfig, IncomeNERConfig, FlightNERConfig, TrainNERConfig
			import re
			import threading
			import subprocess

			from test2 import process_films


			# 配置日志
			logging.basicConfig(
			@@ -203,13 +208,14 @@
			# 还款短信特征
			repayment_keywords = [
			"还款", "账单", "信用卡", "借款", "贷款", "逾期", "欠款", "最低还款",
			"应还金额", "到期还款", "还清", "应还", "还款日", "账单￥", "账单¥", "查账还款"
			"应还金额", "到期还款", "还清", "应还", "还款日", "账单￥", "账单¥", "查账还款",
			"扣款用于", "房贷还款", "信用卡还款", "车贷还款", "应还款额", "最低还款额"
			]

			# 收入短信特征
			income_keywords = [
			"收入", "转账", "入账", "到账", "支付", "工资", "报销", "余额",
			"成功收款", "收到", "款项"
			"成功收款", "收到", "款项", "收入金额"
			]

			# 航班/火车票特征
			@@ -342,8 +348,15 @@
			# 快递关键词
			"快递", "包裹", "取件码", "取件", "签收", "派送", "配送",
			# 收入关键词
			"收入", "转账", "入账", "到账", "支付成功", "工资"
			"收入", "转账", "入账", "到账", "支付成功", "工资", "支付宝转账", "微信转账"
			]

			# 先检查是否为收入短信（优先于通知判断）
			income_indicators = ["收入", "入账", "转账", "工资"]
			for indicator in income_indicators:
			if indicator in text and "元" in text:
			# 可能是收入短信，不要判断为通知
			return False

			# 运营商余额通知特征
			telecom_balance_patterns = [
			@@ -458,20 +471,62 @@
			if word in code:
			code = code[:code.index(word)]

			# 只保留字母、数字和连字符
			code = ''.join(c for c in code if c.isalnum() or c == "-")
			# 针对不同快递公司采取不同策略
			if "丰巢" in text or "蜂巢" in text:
			# 对于丰巢快递，只保留数字
			code = ''.join(c for c in code if c.isdigit())
			elif "中国邮政" in text:
			# 对于中国邮政，尝试提取完整货位号
			cargo_match = re.search(r'货号[:：]\s*([0-9A-Za-z\-]+)', text)
			if cargo_match:
			code = cargo_match.group(1)
			else:
			# 保留原始格式（字母、数字和连字符）
			code = ''.join(c for c in code if c.isalnum() or c == "-")
			else:
			# 对于其他快递，保留字母、数字和连字符
			code = ''.join(c for c in code if c.isalnum() or c == "-")

			# 确保格式正确
			parts = code.split("-")
			valid_parts = []
			for part in parts:
			if part and any(c.isalnum() for c in part):
			valid_parts.append(part)

			if valid_parts:
			result["pickup_code"] = "-".join(valid_parts)
			if "丰巢" in text or "蜂巢" in text:
			# 对于丰巢快递，只保留数字
			if code.isdigit():
			result["pickup_code"] = code
			else:
			# 如果没有数字，尝试再次从文本中匹配纯数字取件码
			pickup_code_match = re.search(r'码[^0-9]*(\d+)', text)
			if pickup_code_match:
			result["pickup_code"] = pickup_code_match.group(1)
			else:
			result["pickup_code"] = None
			elif "中国邮政" in text:
			# 对于中国邮政，验证格式是否合理
			if re.match(r'[0-9A-Za-z\-]+', code):
			result["pickup_code"] = code
			else:
			# 二次尝试：从文本中直接获取货位号
			cargo_match = re.search(r'货号[:：]\s*([0-9A-Za-z\-]+)', text)
			if cargo_match:
			result["pickup_code"] = cargo_match.group(1)
			else:
			# 最后尝试：提取"取件密码"后的纯数字序列
			password_match = re.search(r'密码\s*(\d+)', text)
			if password_match:
			result["pickup_code"] = password_match.group(1)
			else:
			result["pickup_code"] = None
			else:
			result["pickup_code"] = None
			# 对于其他快递，保持原有逻辑
			parts = code.split("-")
			valid_parts = []
			for part in parts:
			if part and any(c.isalnum() for c in part):
			valid_parts.append(part)

			if valid_parts:
			result["pickup_code"] = "-".join(valid_parts)
			else:
			result["pickup_code"] = None

			# 清理公司名称
			if result["company"]:
			@@ -480,7 +535,16 @@
			for word in invalid_words:
			if company.endswith(word):
			company = company[:-len(word)]

			# 特殊处理中国邮政
			if company == "中国" and "中国邮政" in text:
			company = "邮政"
			elif "中国邮政" in text and not company:
			company = "邮政"

			result["company"] = company.strip()
			elif "中国邮政" in text: # 如果NER未识别但文本中有中国邮政
			result["company"] = "邮政"

			# 清理地址
			if result["address"]:
			@@ -623,7 +687,8 @@

			# 尝试查找最低还款金额
			min_amount_match = re.search(r'最低还款([\d,]+\.?\d*)(?:元\|块钱\|块\|万元\|万)?', context)
			if min_amount_match and "MIN_CODE" in current_entity["type"]:
			# 修复：确保current_entity存在且有type属性再使用
			if min_amount_match and 'current_entity' in locals() and current_entity is not None and "MIN_CODE" in current_entity["type"]:
			return min_amount_match.group(1) # 直接返回匹配到的最低还款金额，保留原始格式

			# 在上下文中查找完整金额
			@@ -1360,6 +1425,103 @@
			# 保存原始短信内容到文件
			save_sms_to_file(text)

			# 已还清或已结清的短信模式
			already_paid_patterns = [
			r"已还清",
			r"已结清",
			r"还款.?入账后.?已还清",
			r"还款.?入账.?结清"
			]

			# 检查是否为已还清/已结清的短信
			for pattern in already_paid_patterns:
			if re.search(pattern, text):
			logger.info(f"识别为已还清/已结清短信，归类为其他: {text[:30]}...")
			category = "其他"
			save_sms_to_file(text, category, 1.0)
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": {}
			}
			})

			# 银行收入短信特征识别
			income_patterns = [
			r"收入金额[\d,.]+元",
			r"账户.?收入.?[\d,.]+元",
			r"账户.*?工资",
			r"工资.*?收入",
			r"入账[\d,.]+元",
			# 添加新模式匹配南京银行等类似格式
			r"收入\d+\.\d+元",
			r"账号.*?收入\d+\.\d+元",
			r"尾号\d+的账号.*?收入\d+\.\d+元",
			r"支付宝转账",
			r"转账.*?收入"
			]

			# 银行还款短信特征识别
			repayment_patterns = [
			r"信用卡.*?还款",
			r"账单.*?[\d,.]+元",
			r"应还款额.*?[\d,.]+元",
			r"最低还款额.*?[\d,.]+元",
			r"到期还款日",
			r"扣款.?用于.?还款",
			r"扣款.?用于.?贷款",
			r"扣款.?用于.?信用卡",
			r"车贷还款",
			r"房贷还款",
			r"贷款还款",
			r"信用卡账单",
			r"下次还款日"
			]

			# 检查是否为收入短信
			for pattern in income_patterns:
			if re.search(pattern, text):
			logger.info(f"识别为收入短信: {text[:30]}...")
			category = "收入"
			details = model_manager.extract_income_entities(text)
			save_sms_to_file(text, category, 1.0)
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": details
			}
			})

			# 检查是否为还款短信
			for pattern in repayment_patterns:
			if re.search(pattern, text):
			# 二次检查：如果包含"已还清"或"已结清"等词，归类为"其他"
			if any(re.search(paid_pattern, text) for paid_pattern in already_paid_patterns):
			logger.info(f"虽然识别为还款短信，但包含已还清/已结清，归类为其他: {text[:30]}...")
			category = "其他"
			save_sms_to_file(text, category, 1.0)
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": {}
			}
			})

			logger.info(f"识别为还款短信: {text[:30]}...")
			category = "还款"
			details = model_manager.extract_repayment_entities(text)
			save_sms_to_file(text, category, 1.0)
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": details
			}
			})

			# 特定短信识别逻辑 - 针对百度通知和招商银行账单
			# 识别百度通知
			if "百度智能云" in text and "尊敬的用户" in text and "免费额度" in text:
			@@ -1376,6 +1538,19 @@

			# 识别招商银行账单
			if "招商银行" in text and ("账单￥" in text or "账单¥" in text or "还款日" in text):
			# 检查是否为已还清/已结清短信
			if any(re.search(paid_pattern, text) for paid_pattern in already_paid_patterns):
			logger.info(f"招商银行短信包含已还清/已结清，归类为其他: {text[:30]}...")
			category = "其他"
			save_sms_to_file(text, category, 1.0)
			return jsonify({
			"status": "success",
			"data": {
			"category": category,
			"details": {}
			}
			})

			logger.info(f"直接识别为招商银行还款短信: {text[:30]}...")
			category = "还款"
			details = model_manager.extract_repayment_entities(text)
			@@ -1511,5 +1686,55 @@
			"message": "服务器内部错误"
			}), 500


			def run_douban_spider(type_, tag):
			    """Run the douban.py crawler script in a child process.

			    Used as a thread target by the /crawl-douban handler, so nothing is
			    returned; failures are written to the log instead of being dropped.

			    Args:
			        type_: Forwarded to douban.py as its first command-line argument.
			        tag: Forwarded to douban.py as its second command-line argument.
			    """
			    # subprocess only accepts string arguments; coercing here keeps a
			    # non-string value from raising TypeError inside the worker thread.
			    command = ["python", "douban.py", str(type_), str(tag)]
			    try:
			        completed = subprocess.run(command, capture_output=True, text=True)
			    except (OSError, ValueError) as e:
			        # OSError: interpreter could not be launched.
			        # ValueError: an argument contains an embedded NUL byte.
			        logger.error("豆瓣爬虫启动失败: %s", e)
			        return

			    if completed.returncode != 0:
			        # The output is captured, so without this the failure is invisible.
			        logger.error(
			            "豆瓣爬虫执行失败 (exit code %s): %s",
			            completed.returncode,
			            (completed.stderr or "").strip(),
			        )

			def run_process_films():
			    """Run process_films(), logging any failure instead of raising.

			    Used as a thread target by the /generate-film-content handler, so the
			    exception is caught here and logged rather than left to escape.
			    """
			    try:
			        process_films()
			    except Exception as e:
			        # logger.exception logs at ERROR level and appends the traceback,
			        # which a plain str(e) message would lose.
			        logger.exception("处理电影内容失败: %s", e)

			@app.route("/crawl-douban", methods=["POST"])
			def crawl_douban():
			    """Start the Douban crawler in a background thread.

			    Reads the optional "type" and "tag" fields from a JSON object body.
			    A missing, malformed or non-object body, or a field that is absent or
			    not a string, falls back to the defaults ("tv" / "热门").

			    Returns:
			        A JSON response confirming the task was started, including the
			        effective type and tag; or a JSON error payload with HTTP status
			        500 if anything in the handler raises.
			    """
			    try:
			        default_params = {"type": "tv", "tag": "热门"}

			        # silent=True yields None for a malformed body instead of raising
			        # BadRequest, which the broad except below would report as a 500.
			        params = request.get_json(silent=True) if request.is_json else None
			        if not isinstance(params, dict):
			            params = default_params

			        # Both values become subprocess arguments, which must be strings;
			        # anything else (including a missing key) uses the default.
			        type_ = params.get("type")
			        if not isinstance(type_, str):
			            type_ = default_params["type"]
			        tag = params.get("tag")
			        if not isinstance(tag, str):
			            tag = default_params["tag"]

			        logger.info(f"Received params - type: {type_}, tag: {tag}")

			        # Run the crawler asynchronously so the request returns at once.
			        t = threading.Thread(target=run_douban_spider, args=(type_, tag))
			        t.start()

			        return jsonify({"status": "success", "message": "爬虫任务已启动", "type": type_, "tag": tag})
			    except Exception as e:
			        return jsonify({"status": "error", "message": str(e)}), 500

			@app.route('/generate-film-content', methods=['POST'])
			def generate_film_content_api():
			    """Kick off film-content generation in a background thread.

			    Returns:
			        A JSON "accepted" response once the worker thread has been
			        started, or a JSON error payload with HTTP status 500 if starting
			        the thread or building the response raises.
			    """
			    try:
			        worker = threading.Thread(target=run_process_films)
			        worker.start()
			        accepted = jsonify({"status": "accepted", "message": "任务已异步启动"})
			    except Exception as exc:
			        failure = jsonify({"status": "error", "message": str(exc)})
			        return failure, 500
			    return accepted
			# Script entry point: start the app's server on all interfaces (0.0.0.0),
			# port 5000, only when this file is executed directly rather than imported.
			if __name__ == "__main__":
			app.run(host="0.0.0.0", port=5000)