class NERConfig:
|
# 优化模型参数
|
MODEL_NAME = "bert-base-chinese"
|
MAX_LENGTH = 128 # 获取更多上下文
|
BATCH_SIZE = 4
|
EPOCHS = 10 # 增加训练轮数
|
LEARNING_RATE = 3e-5
|
WARMUP_RATIO = 0.1
|
WEIGHT_DECAY = 0.01
|
|
# 数据增强配置
|
USE_DATA_AUGMENTATION = False # 暂时关闭数据增强
|
AUGMENTATION_RATIO = 0.3
|
|
# 训练策略
|
GRADIENT_ACCUMULATION_STEPS = 4
|
EVAL_STEPS = 25 # 更频繁的验证
|
LOGGING_STEPS = 10
|
|
# 路径配置
|
DATA_PATH = "data/sms_ner.txt"
|
MODEL_PATH = "./models/ner_model"
|
LOG_PATH = "./logs_ner"
|
|
# 训练配置优化
|
SEED = 42
|
TEST_SIZE = 0.1 # 减少测试集比例,增加训练数据
|
EARLY_STOPPING_PATIENCE = 2 # 增加早停耐心值
|
|
# 交叉验证配置
|
N_SPLITS = 3 # CPU环境下减少折数
|
N_SEEDS = 1 # CPU环境下减少种子数量
|
|
# 确保标签列表完整
|
LABELS = [
|
"O",
|
"B-POST", "I-POST",
|
"B-COMPANY", "I-COMPANY",
|
"B-ADDRESS", "I-ADDRESS",
|
"B-PICKUP_CODE", "I-PICKUP_CODE", # 添加取件码标签
|
"B-STATION", "I-STATION" # 添加站点标签
|
]
|
|
# 增加训练数据质量控制
|
MAX_ENTITY_LENGTH = {
|
"PICKUP_CODE": 15, # 限制取件码最大长度
|
"ADDRESS": 50,
|
"COMPANY": 10,
|
"POST": 10,
|
"STATION": 20
|
}
|
|
# 取件码相关配置
|
PICKUP_CODE_CONFIG = {
|
'max_length': 15, # 取件码最大长度
|
'invalid_words': ['的', '来', '到', '取', '件', '码'], # 取件码中不应该出现的词
|
'valid_separators': ['-'], # 有效的分隔符
|
'min_parts': 2, # 最少分段数
|
'max_parts': 4, # 最多分段数
|
'part_length': {
|
'min': 1, # 每段最少字符数
|
'max': 4 # 每段最多字符数
|
}
|
}
|
|
class RepaymentNERConfig:
|
# 优化模型参数
|
MODEL_NAME = "bert-base-chinese"
|
MAX_LENGTH = 128
|
BATCH_SIZE = 4
|
EPOCHS = 10
|
LEARNING_RATE = 3e-5
|
WARMUP_RATIO = 0.1
|
WEIGHT_DECAY = 0.01
|
|
# 数据增强配置
|
USE_DATA_AUGMENTATION = False
|
AUGMENTATION_RATIO = 0.3
|
|
# 训练策略
|
GRADIENT_ACCUMULATION_STEPS = 4
|
EVAL_STEPS = 25
|
LOGGING_STEPS = 10
|
SAVE_STEPS = 25 # 添加保存步数
|
SAVE_TOTAL_LIMIT = 2 # 添加保存检查点数量限制
|
|
# 路径配置
|
DATA_PATH = "data/repayment.txt"
|
MODEL_PATH = "./models/repayment_model"
|
LOG_PATH = "./logs_repayment"
|
|
# 训练配置优化
|
SEED = 42
|
TEST_SIZE = 0.1
|
EARLY_STOPPING_PATIENCE = 2
|
|
# CPU环境配置
|
MAX_GRAD_NORM = 1.0
|
FP16 = False # CPU环境下关闭FP16
|
|
# CPU环境下的数据加载优化
|
DATALOADER_NUM_WORKERS = 0 # CPU环境下设为0
|
DATALOADER_PIN_MEMORY = False # CPU环境下关闭
|
|
# 交叉验证配置
|
N_SPLITS = 3
|
N_SEEDS = 1
|
|
# 标签列表
|
LABELS = [
|
"O",
|
"B-BANK", "I-BANK", # 还款主体
|
"B-TYPE", "I-TYPE", # 还款类型(花呗/信用卡/房贷等)
|
"B-PICKUP_CODE", "I-PICKUP_CODE", # 还款金额
|
"B-DATE", "I-DATE", # 还款日期
|
"B-NUMBER", "I-NUMBER", # 账号尾号
|
"B-MIN_CODE", "I-MIN_CODE" # 最低还款金额
|
]
|
|
# 金额相关配置
|
AMOUNT_CONFIG = {
|
'max_length': 20, # 增加金额最大长度
|
'invalid_words': ['共', '计', '额', '为', '是', '人民币', '¥', '¥', '元', '应还', '本期'],
|
'valid_separators': ['.', ','], # 有效的分隔符
|
'decimal_places': 2, # 小数位数
|
'max_integer_digits': 12, # 增加整数部分最大位数
|
'currency_symbols': ['¥', '¥', 'RMB', '元'], # 货币符号
|
'amount_keywords': ['金额', '应还', '欠款', '待还', '本金', '本期'],
|
'min_amount_keywords': ['最低', '最少', '至少'],
|
'decimal_context_range': 3 # 查找小数点的上下文范围
|
}
|
|
# 修改实体长度限制
|
MAX_ENTITY_LENGTH = {
|
"BANK": 15, # 增加银行名称最大长度
|
"TYPE": 10, # 增加还款类型最大长度
|
"PICKUP_CODE": 20, # 金额最大长度
|
"DATE": 15, # 增加日期最大长度
|
"NUMBER": 10, # 增加账号尾号最大长度
|
"MIN_CODE": 20 # 最低还款金额最大长度
|
}
|
|
class IncomeNERConfig:
|
# 优化模型参数 (与 RepaymentNERConfig 保持一致)
|
MODEL_NAME = "bert-base-chinese"
|
MAX_LENGTH = 128
|
BATCH_SIZE = 4
|
EPOCHS = 10
|
LEARNING_RATE = 3e-5
|
WARMUP_RATIO = 0.1
|
WEIGHT_DECAY = 0.01
|
|
# 数据增强配置
|
USE_DATA_AUGMENTATION = False
|
AUGMENTATION_RATIO = 0.3
|
|
# 训练策略
|
GRADIENT_ACCUMULATION_STEPS = 4
|
EVAL_STEPS = 25
|
LOGGING_STEPS = 10
|
SAVE_STEPS = 25
|
SAVE_TOTAL_LIMIT = 2
|
|
# 路径配置
|
DATA_PATH = "data/income.txt"
|
MODEL_PATH = "./models/income_model"
|
LOG_PATH = "./logs_income"
|
|
# 训练配置优化
|
SEED = 42
|
TEST_SIZE = 0.1
|
EARLY_STOPPING_PATIENCE = 2
|
|
# CPU环境配置
|
MAX_GRAD_NORM = 1.0
|
FP16 = False
|
|
# CPU环境下的数据加载优化
|
DATALOADER_NUM_WORKERS = 0
|
DATALOADER_PIN_MEMORY = False
|
|
# 交叉验证配置
|
N_SPLITS = 3
|
N_SEEDS = 1
|
|
# 标签列表
|
LABELS = [
|
"O",
|
"B-BANK", "I-BANK", # 银行名称
|
"B-NUMBER", "I-NUMBER", # 账号尾号
|
"B-DATATIME", "I-DATATIME", # 交易时间
|
"B-PICKUP_CODE", "I-PICKUP_CODE", # 收入金额
|
"B-BALANCE", "I-BALANCE" # 余额
|
]
|
|
# 实体长度限制
|
MAX_ENTITY_LENGTH = {
|
"BANK": 15, # 银行名称最大长度
|
"NUMBER": 10, # 账号尾号最大长度
|
"DATATIME": 20, # 时间最大长度
|
"PICKUP_CODE": 20, # 收入金额最大长度
|
"BALANCE": 20 # 余额最大长度
|
}
|
|
# 金额相关配置
|
AMOUNT_CONFIG = {
|
'max_length': 20, # 金额最大长度
|
'invalid_words': ['共', '计', '额', '为', '是', '人民币', '¥', '¥', '元'],
|
'valid_separators': ['.', ','], # 有效的分隔符
|
'decimal_places': 2, # 小数位数
|
'max_integer_digits': 12, # 整数部分最大位数
|
'currency_symbols': ['¥', '¥', 'RMB', '元'], # 货币符号
|
'decimal_context_range': 3 # 查找小数点的上下文范围
|
}
|
|
class FlightNERConfig:
|
# 优化模型参数 (与 RepaymentNERConfig 保持一致)
|
MODEL_NAME = "bert-base-chinese"
|
MAX_LENGTH = 128
|
BATCH_SIZE = 4
|
EPOCHS = 10
|
LEARNING_RATE = 3e-5
|
WARMUP_RATIO = 0.1
|
WEIGHT_DECAY = 0.01
|
|
# 训练策略
|
GRADIENT_ACCUMULATION_STEPS = 4
|
EVAL_STEPS = 25
|
LOGGING_STEPS = 10
|
SAVE_STEPS = 25
|
SAVE_TOTAL_LIMIT = 2
|
|
# 路径配置
|
DATA_PATH = "data/flight.txt"
|
MODEL_PATH = "./models/flight_model"
|
LOG_PATH = "./logs_flight"
|
|
# 训练配置
|
SEED = 42
|
TEST_SIZE = 0.1
|
EARLY_STOPPING_PATIENCE = 2
|
|
# CPU环境配置
|
MAX_GRAD_NORM = 1.0
|
FP16 = False
|
DATALOADER_NUM_WORKERS = 0
|
DATALOADER_PIN_MEMORY = False
|
|
# 交叉验证配置
|
N_SPLITS = 3
|
N_SEEDS = 3 # 增加种子数量以提高模型稳定性
|
|
# 标签列表 - 保持与需求一致
|
LABELS = [
|
"O",
|
"B-FLIGHT", "I-FLIGHT", # 航班号
|
"B-COMPANY", "I-COMPANY", # 航空公司
|
"B-START", "I-START", # 出发地
|
"B-END", "I-END", # 目的地
|
"B-DATE", "I-DATE", # 日期
|
"B-TIME", "I-TIME", # 时间
|
"B-DEPARTURE_TIME", "I-DEPARTURE_TIME", # 起飞时间
|
"B-ARRIVAL_TIME", "I-ARRIVAL_TIME", # 到达时间
|
"B-TICKET_NUM", "I-TICKET_NUM", # 机票号码
|
"B-SEAT", "I-SEAT" # 座位等信息
|
]
|
|
# 实体长度限制 - 更新键名与LABELS一致
|
MAX_ENTITY_LENGTH = {
|
"FLIGHT": 10, # 航班号
|
"COMPANY": 15, # 航空公司
|
"START": 10, # 出发地
|
"END": 10, # 目的地
|
"DATE": 15, # 日期
|
"TIME": 10, # 时间
|
"DEPARTURE_TIME": 10, # 起飞时间
|
"ARRIVAL_TIME": 10, # 到达时间
|
"TICKET_NUM": 10, # 用户姓名
|
"SEAT": 10 # 座位等信息
|
}
|
|
# 航班号配置
|
FLIGHT_CONFIG = {
|
'pattern': r'[A-Z]{2}\d{3,4}',
|
'min_length': 4,
|
'max_length': 7,
|
'carrier_codes': ['CA', 'MU', 'CZ', 'HU', '3U', 'ZH', 'FM', 'MF', 'SC', '9C'] # 常见航司代码
|
}
|
|
class TrainNERConfig:
|
# 模型参数
|
MODEL_NAME = "bert-base-chinese"
|
MAX_LENGTH = 128
|
BATCH_SIZE = 4
|
EPOCHS = 10
|
LEARNING_RATE = 3e-5
|
WARMUP_RATIO = 0.1
|
WEIGHT_DECAY = 0.01
|
|
# 训练策略
|
GRADIENT_ACCUMULATION_STEPS = 4
|
EVAL_STEPS = 25
|
LOGGING_STEPS = 10
|
SAVE_STEPS = 25
|
SAVE_TOTAL_LIMIT = 2
|
|
# 路径配置
|
DATA_PATH = "data/train.txt"
|
MODEL_PATH = "./models/train_model"
|
LOG_PATH = "./logs_train"
|
|
# 训练配置
|
SEED = 42
|
TEST_SIZE = 0.1
|
EARLY_STOPPING_PATIENCE = 2
|
|
# CPU环境配置
|
MAX_GRAD_NORM = 1.0
|
FP16 = False
|
DATALOADER_NUM_WORKERS = 0
|
DATALOADER_PIN_MEMORY = False
|
|
# 交叉验证配置
|
N_SPLITS = 3
|
N_SEEDS = 3 # 增加种子数量以提高模型稳定性
|
|
# 标签列表
|
LABELS = [
|
"O",
|
"B-COMPANY", "I-COMPANY", # 车次
|
"B-TRIPS", "I-TRIPS", # 车次
|
"B-START", "I-START", # 出发站
|
"B-END", "I-END", # 到达站
|
"B-DATE", "I-DATE", # 日期
|
"B-TIME", "I-TIME", # 时间
|
"B-SEAT", "I-SEAT", # 座位等信息
|
"B-NAME", "I-NAME" # 用户姓名
|
]
|
|
# 实体长度限制 - 更新键名与LABELS一致
|
MAX_ENTITY_LENGTH = {
|
"COMPANY": 8, # 12306
|
"TRIPS": 8, # 车次
|
"START": 10, # 出发站
|
"END": 10, # 到达站
|
"DATE": 15, # 日期
|
"TIME": 10, # 时间
|
"SEAT": 10, # 座位等信息
|
"NAME": 10 # 用户姓名
|
}
|
|
# 车次配置
|
TRIPS_CONFIG = {
|
'patterns': [
|
r'[GDCZTKY]\d{1,2}', # G1, D1, C1等
|
r'[GDCZTKY]\d{1,2}/\d{1,2}', # G1/2等联运车次
|
r'[GDCZTKY]\d{1,2}-\d{1,2}', # G1-2等联运车次
|
r'\d{1,4}', # 普通车次如1234次
|
r'[A-Z]\d{1,4}' # Z1234等特殊车次
|
],
|
'min_length': 1,
|
'max_length': 8,
|
'train_types': ['G', 'D', 'C', 'Z', 'T', 'K', 'Y'] # 车次类型前缀
|
}
|