import os
import re
from typing import List, Optional, Set


# Script: interactive character-level NER labeling tool that validates
# entity label formats before appending them to a data file.
class NERDataEntryTool:
    """Interactive console tool for labeling sentences character by character.

    Each accepted sentence is appended to ``self.data_file`` as one
    ``"<char> <label>"`` line per labeled character, followed by a blank line.
    """

    def __init__(self):
        # Predefined label prefixes and entity types.
        self.valid_prefixes = {'B-', 'I-', 'O'}
        self.valid_entity_types = {'POST', 'COMPANY', 'ADDRESS', 'PICKUP_CODE', 'STATION'}
        self.data_file = "data/sms_ner.txt"

    def validate_label(self, label: str) -> bool:
        """Return True if *label* is ``O`` or ``B-<type>`` / ``I-<type>``.

        ``<type>`` must be one of ``self.valid_entity_types``. Matching is
        case-sensitive.
        """
        if label == 'O':
            return True
        # A label needs at least 3 characters: "B-X" or "I-X".
        if len(label) < 3:
            return False
        prefix, entity_type = label[:2], label[2:]
        return prefix in self.valid_prefixes and entity_type in self.valid_entity_types

    def show_help(self):
        """Print the valid label formats, entity types and special commands."""
        print("\n=== NER数据标注帮助 ===")
        print("有效的标签格式:")
        print("1. O (表示非实体)")
        print("2. B-实体类型 (实体开始)")
        print("3. I-实体类型 (实体继续)")
        print("\n有效的实体类型:")
        # Sorted so the listing is stable between runs (set order is not).
        for entity_type in sorted(self.valid_entity_types):
            print(f"- {entity_type}")
        print("\n特殊命令:")
        print("- q:退出程序")
        print("- h:显示帮助")
        print("- u:撤销上一个标注")
        print("- s:跳过当前字符")
        print("- r:重新开始当前句子")
        print("================\n")

    def add_sentence(self) -> bool:
        """Prompt for one sentence, label it interactively, and save it.

        Special commands accepted at the label prompt (case-insensitive):
        ``h`` help, ``u`` undo the previous step, ``s`` skip the current
        character, ``r`` restart labeling of the current sentence, ``q`` quit.

        Skipped characters are left out of the saved output, so every saved
        label stays attached to the character it was entered for.

        Returns:
            False if the user asked to quit, True otherwise (the caller
            should prompt for another sentence).
        """
        sentence = input("\n请输入要标注的句子(输入'q'退出):").strip()
        if sentence.lower() == 'q':
            return False
        if not sentence:
            print("句子不能为空!")
            return True

        # One slot per processed character; None marks a skipped character.
        # Keeping a slot for skips keeps labels aligned with their characters
        # and lets undo step back over a skip correctly.
        decisions: List[Optional[str]] = []
        total = len(sentence)

        while len(decisions) < total:
            position = len(decisions)
            char = sentence[position]
            print(f"\n当前句子:{sentence}")
            print(f"当前进度:{'▣' * position}{'□' * (total - position)}")
            print(f"当前字符:{char}")

            label = input(f"请输入'{char}'的标签(h查看帮助):").strip()
            command = label.lower()

            # Handle special commands.
            if command == 'h':
                self.show_help()
                continue
            if command == 'u':
                if decisions:
                    decisions.pop()
                else:
                    print("没有可撤销的标注")
                continue
            if command == 'r':
                # Restart the *current* sentence: drop all progress, keep the text.
                decisions.clear()
                print("已重新开始当前句子")
                continue
            if command == 's':
                decisions.append(None)
                continue
            if command == 'q':
                return False

            # Validate the label.
            if not self.validate_label(label):
                print(f"错误:无效的标签格式 '{label}'")
                print("请使用 'h' 查看有效的标签格式")
                continue

            decisions.append(label)

        kept = [(ch, lab) for ch, lab in zip(sentence, decisions) if lab is not None]
        if not kept:
            # Every character was skipped; avoid writing an empty record.
            print("未标注任何字符,未保存")
            return True

        # Save to file.
        self.save_to_file(''.join(ch for ch, _ in kept), [lab for _, lab in kept])
        print("\n✓ 句子标注已保存")
        return True

    def save_to_file(self, sentence: str, labels: List[str]):
        """Append ``"<char> <label>"`` lines for *sentence* to ``self.data_file``.

        Characters and labels are paired positionally; a blank line is written
        after the sentence as a separator. The parent directory is created
        when the path has one.
        """
        directory = os.path.dirname(self.data_file)
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.data_file, 'a', encoding='utf-8') as f:
            f.writelines(f"{char} {label}\n" for char, label in zip(sentence, labels))
            f.write("\n")  # Blank line separates sentences.

    def run(self):
        """Run the data-entry loop until the user quits or input ends."""
        print("=== NER数据录入工具 ===")
        print("输入 'h' 查看帮助信息")
        print("输入 'q' 退出程序")

        while True:
            try:
                if not self.add_sentence():
                    break
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or Ctrl-C: stop instead of retrying forever.
                print()
                break
            except Exception as e:
                print(f"\n错误:{str(e)}")
                print("请重试或输入 'q' 退出")

        print("\n数据录入已完成!")


if __name__ == "__main__":
    tool = NERDataEntryTool()
    tool.run()