class NERConfig:
    """Constants for the SMS NER setup (data file ``data/sms_ner.txt``).

    Groups model hyperparameters, file paths, the BIO label list and
    per-entity length limits, plus pickup-code validation settings.
    """

    # --- Model parameters -------------------------------------------------
    MODEL_NAME = "bert-base-chinese"
    MAX_LENGTH = 128  # longer window to capture more context
    BATCH_SIZE = 4
    EPOCHS = 10  # raised number of training epochs
    LEARNING_RATE = 3e-5
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01

    # --- Data augmentation ------------------------------------------------
    USE_DATA_AUGMENTATION = False  # augmentation is switched off for now
    AUGMENTATION_RATIO = 0.3

    # --- Training strategy ------------------------------------------------
    GRADIENT_ACCUMULATION_STEPS = 4
    EVAL_STEPS = 25  # validate more frequently
    LOGGING_STEPS = 10

    # --- Paths ------------------------------------------------------------
    DATA_PATH = "data/sms_ner.txt"
    MODEL_PATH = "./models/ner_model"
    LOG_PATH = "./logs_ner"

    # --- Training setup ---------------------------------------------------
    SEED = 42
    TEST_SIZE = 0.1  # smaller test split leaves more data for training
    EARLY_STOPPING_PATIENCE = 2  # early-stopping patience

    # --- Cross-validation -------------------------------------------------
    N_SPLITS = 3  # fewer folds because of the CPU-only environment
    N_SEEDS = 1  # fewer seeds because of the CPU-only environment

    # --- Labels (keep this list complete) ---------------------------------
    LABELS = [
        "O",
        "B-POST",
        "I-POST",
        "B-COMPANY",
        "I-COMPANY",
        "B-ADDRESS",
        "I-ADDRESS",
        "B-PICKUP_CODE",  # pickup-code labels
        "I-PICKUP_CODE",
        "B-STATION",  # station labels
        "I-STATION",
    ]

    # --- Training-data quality control: max length per entity type --------
    MAX_ENTITY_LENGTH = {
        "PICKUP_CODE": 15,  # cap on pickup-code length
        "ADDRESS": 50,
        "COMPANY": 10,
        "POST": 10,
        "STATION": 20,
    }

    # --- Pickup-code settings ---------------------------------------------
    PICKUP_CODE_CONFIG = {
        "max_length": 15,  # maximum pickup-code length
        # words that should not appear inside a pickup code
        "invalid_words": ["的", "来", "到", "取", "件", "码"],
        "valid_separators": ["-"],  # accepted separators
        "min_parts": 2,  # minimum number of segments
        "max_parts": 4,  # maximum number of segments
        "part_length": {
            "min": 1,  # minimum characters per segment
            "max": 4,  # maximum characters per segment
        },
    }


class RepaymentNERConfig:
    """Constants for the repayment NER setup (data file ``data/repayment.txt``).

    Groups model hyperparameters, checkpoint/dataloader settings, file
    paths, the BIO label list, amount-parsing settings and per-entity
    length limits.
    """

    # --- Model parameters -------------------------------------------------
    MODEL_NAME = "bert-base-chinese"
    MAX_LENGTH = 128
    BATCH_SIZE = 4
    EPOCHS = 10
    LEARNING_RATE = 3e-5
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01

    # --- Data augmentation ------------------------------------------------
    USE_DATA_AUGMENTATION = False
    AUGMENTATION_RATIO = 0.3

    # --- Training strategy ------------------------------------------------
    GRADIENT_ACCUMULATION_STEPS = 4
    EVAL_STEPS = 25
    LOGGING_STEPS = 10
    SAVE_STEPS = 25  # checkpoint save interval
    SAVE_TOTAL_LIMIT = 2  # limit on the number of checkpoints kept

    # --- Paths ------------------------------------------------------------
    DATA_PATH = "data/repayment.txt"
    MODEL_PATH = "./models/repayment_model"
    LOG_PATH = "./logs_repayment"

    # --- Training setup ---------------------------------------------------
    SEED = 42
    TEST_SIZE = 0.1
    EARLY_STOPPING_PATIENCE = 2

    # --- CPU environment --------------------------------------------------
    MAX_GRAD_NORM = 1.0
    FP16 = False  # FP16 is disabled on CPU

    # --- Data loading on CPU ----------------------------------------------
    DATALOADER_NUM_WORKERS = 0  # set to 0 on CPU
    DATALOADER_PIN_MEMORY = False  # disabled on CPU

    # --- Cross-validation -------------------------------------------------
    N_SPLITS = 3
    N_SEEDS = 1

    # --- Labels -----------------------------------------------------------
    LABELS = [
        "O",
        "B-BANK",  # repaying entity
        "I-BANK",
        "B-TYPE",  # repayment type (Huabei / credit card / mortgage, ...)
        "I-TYPE",
        "B-PICKUP_CODE",  # repayment amount (tagged with the PICKUP_CODE name)
        "I-PICKUP_CODE",
        "B-DATE",  # repayment date
        "I-DATE",
        "B-NUMBER",  # trailing digits of the account number
        "I-NUMBER",
        "B-MIN_CODE",  # minimum repayment amount
        "I-MIN_CODE",
    ]

    # --- Amount settings --------------------------------------------------
    # NOTE(review): the yen sign is listed twice in 'invalid_words' and in
    # 'currency_symbols'; one entry may have been meant as the full-width
    # form -- confirm against the original file encoding.
    AMOUNT_CONFIG = {
        "max_length": 20,  # maximum amount length (raised)
        "invalid_words": [
            "共", "计", "额", "为", "是", "人民币", "¥", "¥", "元", "应还", "本期",
        ],
        "valid_separators": [".", ","],  # accepted separators
        "decimal_places": 2,  # number of decimal places
        "max_integer_digits": 12,  # maximum digits in the integer part (raised)
        "currency_symbols": ["¥", "¥", "RMB", "元"],  # currency symbols
        "amount_keywords": ["金额", "应还", "欠款", "待还", "本金", "本期"],
        "min_amount_keywords": ["最低", "最少", "至少"],
        "decimal_context_range": 3,  # context range used to look for a decimal point
    }

    # --- Max length per entity type (revised limits) ----------------------
    MAX_ENTITY_LENGTH = {
        "BANK": 15,  # bank name (raised)
        "TYPE": 10,  # repayment type (raised)
        "PICKUP_CODE": 20,  # amount
        "DATE": 15,  # date (raised)
        "NUMBER": 10,  # account tail digits (raised)
        "MIN_CODE": 20,  # minimum repayment amount
    }


class IncomeNERConfig:
    """Constants for the income NER setup (data file ``data/income.txt``).

    The model and training values mirror ``RepaymentNERConfig``; labels,
    entity length limits and amount settings are specific to this class.
    """

    # --- Model parameters (kept in line with RepaymentNERConfig) ----------
    MODEL_NAME = "bert-base-chinese"
    MAX_LENGTH = 128
    BATCH_SIZE = 4
    EPOCHS = 10
    LEARNING_RATE = 3e-5
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01

    # --- Data augmentation ------------------------------------------------
    USE_DATA_AUGMENTATION = False
    AUGMENTATION_RATIO = 0.3

    # --- Training strategy ------------------------------------------------
    GRADIENT_ACCUMULATION_STEPS = 4
    EVAL_STEPS = 25
    LOGGING_STEPS = 10
    SAVE_STEPS = 25
    SAVE_TOTAL_LIMIT = 2

    # --- Paths ------------------------------------------------------------
    DATA_PATH = "data/income.txt"
    MODEL_PATH = "./models/income_model"
    LOG_PATH = "./logs_income"

    # --- Training setup ---------------------------------------------------
    SEED = 42
    TEST_SIZE = 0.1
    EARLY_STOPPING_PATIENCE = 2

    # --- CPU environment --------------------------------------------------
    MAX_GRAD_NORM = 1.0
    FP16 = False

    # --- Data loading on CPU ----------------------------------------------
    DATALOADER_NUM_WORKERS = 0
    DATALOADER_PIN_MEMORY = False

    # --- Cross-validation -------------------------------------------------
    N_SPLITS = 3
    N_SEEDS = 1

    # --- Labels -----------------------------------------------------------
    LABELS = [
        "O",
        "B-BANK",  # bank name
        "I-BANK",
        "B-NUMBER",  # trailing digits of the account number
        "I-NUMBER",
        "B-DATATIME",  # transaction time
        "I-DATATIME",
        "B-PICKUP_CODE",  # income amount (tagged with the PICKUP_CODE name)
        "I-PICKUP_CODE",
        "B-BALANCE",  # balance
        "I-BALANCE",
    ]

    # --- Max length per entity type ---------------------------------------
    MAX_ENTITY_LENGTH = {
        "BANK": 15,  # bank name
        "NUMBER": 10,  # account tail digits
        "DATATIME": 20,  # time
        "PICKUP_CODE": 20,  # income amount
        "BALANCE": 20,  # balance
    }

    # --- Amount settings --------------------------------------------------
    # NOTE(review): the yen sign is listed twice in 'invalid_words' and in
    # 'currency_symbols'; one entry may have been meant as the full-width
    # form -- confirm against the original file encoding.
    AMOUNT_CONFIG = {
        "max_length": 20,  # maximum amount length
        "invalid_words": ["共", "计", "额", "为", "是", "人民币", "¥", "¥", "元"],
        "valid_separators": [".", ","],  # accepted separators
        "decimal_places": 2,  # number of decimal places
        "max_integer_digits": 12,  # maximum digits in the integer part
        "currency_symbols": ["¥", "¥", "RMB", "元"],  # currency symbols
        "decimal_context_range": 3,  # context range used to look for a decimal point
    }