class NERConfig:
    """Constants for the parcel pick-up SMS NER task.

    Holds model/training hyper-parameters, file paths, the BIO label set
    (POST, COMPANY, ADDRESS, PICKUP_CODE, STATION) and validation limits
    for extracted entities. All values are plain class attributes.
    """

    # Model parameters
    MODEL_NAME = "bert-base-chinese"
    MAX_LENGTH = 128  # longer sequences to capture more context
    BATCH_SIZE = 4
    EPOCHS = 10  # increased number of training epochs
    LEARNING_RATE = 3e-5
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01

    # Data augmentation
    USE_DATA_AUGMENTATION = False  # augmentation temporarily disabled
    AUGMENTATION_RATIO = 0.3

    # Training strategy
    GRADIENT_ACCUMULATION_STEPS = 4
    EVAL_STEPS = 25  # evaluate more frequently
    LOGGING_STEPS = 10

    # Paths
    DATA_PATH = "data/sms_ner.txt"
    MODEL_PATH = "./models/ner_model"
    LOG_PATH = "./logs_ner"

    # Training setup
    SEED = 42
    TEST_SIZE = 0.1  # smaller test split, leaving more data for training
    EARLY_STOPPING_PATIENCE = 2  # early-stopping patience

    # Cross-validation
    N_SPLITS = 3  # fewer folds for CPU-only environments
    N_SEEDS = 1  # fewer seeds for CPU-only environments

    # Full BIO label list
    LABELS = [
        "O",
        "B-POST", "I-POST",
        "B-COMPANY", "I-COMPANY",
        "B-ADDRESS", "I-ADDRESS",
        "B-PICKUP_CODE", "I-PICKUP_CODE",  # pick-up code labels
        "B-STATION", "I-STATION",  # station labels
    ]

    # Training-data quality control: maximum length per entity type
    MAX_ENTITY_LENGTH = {
        "PICKUP_CODE": 15,  # cap on pick-up code length
        "ADDRESS": 50,
        "COMPANY": 10,
        "POST": 10,
        "STATION": 20,
    }

    # Pick-up code validation settings
    PICKUP_CODE_CONFIG = {
        'max_length': 15,  # maximum pick-up code length
        # words that should not appear inside a pick-up code
        'invalid_words': ['的', '来', '到', '取', '件', '码'],
        'valid_separators': ['-'],  # accepted separators
        'min_parts': 2,  # minimum number of segments
        'max_parts': 4,  # maximum number of segments
        'part_length': {
            'min': 1,  # minimum characters per segment
            'max': 4,  # maximum characters per segment
        },
    }


class RepaymentNERConfig:
    """Constants for the repayment-reminder SMS NER task.

    Holds model/training hyper-parameters, file paths, the BIO label set
    (BANK, TYPE, PICKUP_CODE, DATE, NUMBER, MIN_CODE) and amount-parsing
    settings. All values are plain class attributes.
    """

    # Model parameters
    MODEL_NAME = "bert-base-chinese"
    MAX_LENGTH = 128
    BATCH_SIZE = 4
    EPOCHS = 10
    LEARNING_RATE = 3e-5
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01

    # Data augmentation
    USE_DATA_AUGMENTATION = False
    AUGMENTATION_RATIO = 0.3

    # Training strategy
    GRADIENT_ACCUMULATION_STEPS = 4
    EVAL_STEPS = 25
    LOGGING_STEPS = 10
    SAVE_STEPS = 25  # checkpoint save interval (steps)
    SAVE_TOTAL_LIMIT = 2  # maximum number of checkpoints kept

    # Paths
    DATA_PATH = "data/repayment.txt"
    MODEL_PATH = "./models/repayment_model"
    LOG_PATH = "./logs_repayment"

    # Training setup
    SEED = 42
    TEST_SIZE = 0.1
    EARLY_STOPPING_PATIENCE = 2

    # CPU environment
    MAX_GRAD_NORM = 1.0
    FP16 = False  # FP16 disabled on CPU

    # Data loading on CPU
    DATALOADER_NUM_WORKERS = 0  # set to 0 on CPU
    DATALOADER_PIN_MEMORY = False  # disabled on CPU

    # Cross-validation
    N_SPLITS = 3
    N_SEEDS = 1

    # BIO label list
    # NOTE(review): PICKUP_CODE / MIN_CODE are reused here as the label
    # names for amounts; presumably they must match the annotated data,
    # so they are left unchanged.
    LABELS = [
        "O",
        "B-BANK", "I-BANK",  # repaying institution
        "B-TYPE", "I-TYPE",  # repayment type (Huabei / credit card / mortgage, ...)
        "B-PICKUP_CODE", "I-PICKUP_CODE",  # repayment amount
        "B-DATE", "I-DATE",  # repayment date
        "B-NUMBER", "I-NUMBER",  # account tail number
        "B-MIN_CODE", "I-MIN_CODE",  # minimum repayment amount
    ]

    # Amount parsing settings
    # NOTE(review): '¥' appears twice in the symbol lists as found; one
    # entry may originally have been the full-width variant - confirm.
    AMOUNT_CONFIG = {
        'max_length': 20,  # maximum amount length
        'invalid_words': [
            '共', '计', '额', '为', '是', '人民币', '¥', '¥', '元',
            '应还', '本期',
        ],
        'valid_separators': ['.', ','],  # accepted separators
        'decimal_places': 2,  # number of decimal places
        'max_integer_digits': 12,  # maximum digits in the integer part
        'currency_symbols': ['¥', '¥', 'RMB', '元'],  # currency symbols
        'amount_keywords': ['金额', '应还', '欠款', '待还', '本金', '本期'],
        'min_amount_keywords': ['最低', '最少', '至少'],
        'decimal_context_range': 3,  # context window when looking for a decimal point
    }

    # Maximum length per entity type
    MAX_ENTITY_LENGTH = {
        "BANK": 15,  # bank name
        "TYPE": 10,  # repayment type
        "PICKUP_CODE": 20,  # repayment amount
        "DATE": 15,  # date
        "NUMBER": 10,  # account tail number
        "MIN_CODE": 20,  # minimum repayment amount
    }


class IncomeNERConfig:
    """Constants for the income/deposit SMS NER task.

    Hyper-parameters mirror RepaymentNERConfig. Holds file paths, the BIO
    label set (BANK, NUMBER, DATATIME, PICKUP_CODE, BALANCE) and
    amount-parsing settings. All values are plain class attributes.
    """

    # Model parameters (kept in line with RepaymentNERConfig)
    MODEL_NAME = "bert-base-chinese"
    MAX_LENGTH = 128
    BATCH_SIZE = 4
    EPOCHS = 10
    LEARNING_RATE = 3e-5
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01

    # Data augmentation
    USE_DATA_AUGMENTATION = False
    AUGMENTATION_RATIO = 0.3

    # Training strategy
    GRADIENT_ACCUMULATION_STEPS = 4
    EVAL_STEPS = 25
    LOGGING_STEPS = 10
    SAVE_STEPS = 25
    SAVE_TOTAL_LIMIT = 2

    # Paths
    DATA_PATH = "data/income.txt"
    MODEL_PATH = "./models/income_model"
    LOG_PATH = "./logs_income"

    # Training setup
    SEED = 42
    TEST_SIZE = 0.1
    EARLY_STOPPING_PATIENCE = 2

    # CPU environment
    MAX_GRAD_NORM = 1.0
    FP16 = False

    # Data loading on CPU
    DATALOADER_NUM_WORKERS = 0
    DATALOADER_PIN_MEMORY = False

    # Cross-validation
    N_SPLITS = 3
    N_SEEDS = 1

    # BIO label list
    # NOTE(review): "DATATIME" (sic) and the reuse of PICKUP_CODE for the
    # amount presumably must match the annotated data; left unchanged.
    LABELS = [
        "O",
        "B-BANK", "I-BANK",  # bank name
        "B-NUMBER", "I-NUMBER",  # account tail number
        "B-DATATIME", "I-DATATIME",  # transaction time
        "B-PICKUP_CODE", "I-PICKUP_CODE",  # income amount
        "B-BALANCE", "I-BALANCE",  # balance
    ]

    # Maximum length per entity type
    MAX_ENTITY_LENGTH = {
        "BANK": 15,  # bank name
        "NUMBER": 10,  # account tail number
        "DATATIME": 20,  # transaction time
        "PICKUP_CODE": 20,  # income amount
        "BALANCE": 20,  # balance
    }

    # Amount parsing settings
    # NOTE(review): '¥' appears twice in the symbol lists as found; one
    # entry may originally have been the full-width variant - confirm.
    AMOUNT_CONFIG = {
        'max_length': 20,  # maximum amount length
        'invalid_words': ['共', '计', '额', '为', '是', '人民币', '¥', '¥', '元'],
        'valid_separators': ['.', ','],  # accepted separators
        'decimal_places': 2,  # number of decimal places
        'max_integer_digits': 12,  # maximum digits in the integer part
        'currency_symbols': ['¥', '¥', 'RMB', '元'],  # currency symbols
        'decimal_context_range': 3,  # context window when looking for a decimal point
    }


class FlightNERConfig:
    """Constants for the flight-itinerary SMS NER task.

    Hyper-parameters mirror RepaymentNERConfig (without the augmentation
    switches). Holds file paths, the BIO label set, per-entity length
    limits and the flight-number matching settings.
    """

    # Model parameters (kept in line with RepaymentNERConfig)
    MODEL_NAME = "bert-base-chinese"
    MAX_LENGTH = 128
    BATCH_SIZE = 4
    EPOCHS = 10
    LEARNING_RATE = 3e-5
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01

    # Training strategy
    GRADIENT_ACCUMULATION_STEPS = 4
    EVAL_STEPS = 25
    LOGGING_STEPS = 10
    SAVE_STEPS = 25
    SAVE_TOTAL_LIMIT = 2

    # Paths
    DATA_PATH = "data/flight.txt"
    MODEL_PATH = "./models/flight_model"
    LOG_PATH = "./logs_flight"

    # Training setup
    SEED = 42
    TEST_SIZE = 0.1
    EARLY_STOPPING_PATIENCE = 2

    # CPU environment
    MAX_GRAD_NORM = 1.0
    FP16 = False
    DATALOADER_NUM_WORKERS = 0
    DATALOADER_PIN_MEMORY = False

    # Cross-validation
    N_SPLITS = 3
    N_SEEDS = 3  # more seeds for better model stability

    # BIO label list - kept in line with the requirements
    LABELS = [
        "O",
        "B-FLIGHT", "I-FLIGHT",  # flight number
        "B-COMPANY", "I-COMPANY",  # airline
        "B-START", "I-START",  # departure place
        "B-END", "I-END",  # destination
        "B-DATE", "I-DATE",  # date
        "B-TIME", "I-TIME",  # time
        "B-DEPARTURE_TIME", "I-DEPARTURE_TIME",  # departure time
        "B-ARRIVAL_TIME", "I-ARRIVAL_TIME",  # arrival time
        "B-TICKET_NUM", "I-TICKET_NUM",  # ticket number
        "B-SEAT", "I-SEAT",  # seat and related info
    ]

    # Maximum length per entity type - keys match LABELS
    MAX_ENTITY_LENGTH = {
        "FLIGHT": 10,  # flight number
        "COMPANY": 15,  # airline
        "START": 10,  # departure place
        "END": 10,  # destination
        "DATE": 15,  # date
        "TIME": 10,  # time
        "DEPARTURE_TIME": 10,  # departure time
        "ARRIVAL_TIME": 10,  # arrival time
        # NOTE(review): this entry was previously annotated "user name";
        # e-ticket numbers are commonly 13 digits, so a limit of 10 may be
        # too tight - confirm against the data before changing.
        "TICKET_NUM": 10,  # ticket number
        "SEAT": 10,  # seat and related info
    }

    # Flight-number settings
    FLIGHT_CONFIG = {
        # Two-character airline designator followed by 3-4 digits. The
        # designator may contain one digit (e.g. '3U', '9C' below), which
        # a letters-only prefix would reject. The group is non-capturing
        # so findall()/match results keep returning the whole match.
        'pattern': r'(?:[A-Z]{2}|[A-Z]\d|\d[A-Z])\d{3,4}',
        'min_length': 4,
        'max_length': 7,
        # common carrier codes
        'carrier_codes': [
            'CA', 'MU', 'CZ', 'HU', '3U', 'ZH', 'FM', 'MF', 'SC', '9C',
        ],
    }


class TrainNERConfig:
    """Constants for the railway-ticket SMS NER task.

    Holds model/training hyper-parameters, file paths, the BIO label set
    (COMPANY, TRIPS, START, END, DATE, TIME, SEAT, NAME), per-entity
    length limits and the train-number matching settings.
    """

    # Model parameters
    MODEL_NAME = "bert-base-chinese"
    MAX_LENGTH = 128
    BATCH_SIZE = 4
    EPOCHS = 10
    LEARNING_RATE = 3e-5
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01

    # Training strategy
    GRADIENT_ACCUMULATION_STEPS = 4
    EVAL_STEPS = 25
    LOGGING_STEPS = 10
    SAVE_STEPS = 25
    SAVE_TOTAL_LIMIT = 2

    # Paths
    DATA_PATH = "data/train.txt"
    MODEL_PATH = "./models/train_model"
    LOG_PATH = "./logs_train"

    # Training setup
    SEED = 42
    TEST_SIZE = 0.1
    EARLY_STOPPING_PATIENCE = 2

    # CPU environment
    MAX_GRAD_NORM = 1.0
    FP16 = False
    DATALOADER_NUM_WORKERS = 0
    DATALOADER_PIN_MEMORY = False

    # Cross-validation
    N_SPLITS = 3
    N_SEEDS = 3  # more seeds for better model stability

    # BIO label list
    LABELS = [
        "O",
        "B-COMPANY", "I-COMPANY",  # company / issuer (e.g. "12306")
        "B-TRIPS", "I-TRIPS",  # train number
        "B-START", "I-START",  # departure station
        "B-END", "I-END",  # arrival station
        "B-DATE", "I-DATE",  # date
        "B-TIME", "I-TIME",  # time
        "B-SEAT", "I-SEAT",  # seat and related info
        "B-NAME", "I-NAME",  # passenger name
    ]

    # Maximum length per entity type - keys match LABELS
    MAX_ENTITY_LENGTH = {
        "COMPANY": 8,  # company / issuer (e.g. "12306")
        "TRIPS": 8,  # train number
        "START": 10,  # departure station
        "END": 10,  # arrival station
        "DATE": 15,  # date
        "TIME": 10,  # time
        "SEAT": 10,  # seat and related info
        "NAME": 10,  # passenger name
    }

    # Train-number settings
    TRIPS_CONFIG = {
        'patterns': [
            r'[GDCZTKY]\d{1,2}',  # G1, D1, C1, ...
            r'[GDCZTKY]\d{1,2}/\d{1,2}',  # joint services such as G1/2
            r'[GDCZTKY]\d{1,2}-\d{1,2}',  # joint services such as G1-2
            r'\d{1,4}',  # plain numeric services such as 1234
            r'[A-Z]\d{1,4}',  # lettered services such as Z1234
        ],
        'min_length': 1,
        'max_length': 8,
        'train_types': ['G', 'D', 'C', 'Z', 'T', 'K', 'Y'],  # train-type prefixes
    }