import os
|
import re
|
from typing import List, Set
|
|
# Script: interactive NER data-entry tool that validates BIO-style entity labels
|
|
class NERDataEntryTool:
    """Interactive command-line tool for character-level NER annotation.

    The user types a sentence and then labels it one character at a time
    with ``O``, ``B-<TYPE>`` or ``I-<TYPE>``. Each finished sentence is
    appended to ``data_file`` as ``<char> <label>`` lines followed by one
    blank separator line.
    """

    def __init__(self):
        # Accepted tag markers and entity type names.
        self.valid_prefixes = {'B-', 'I-', 'O'}
        self.valid_entity_types = {'POST', 'COMPANY', 'ADDRESS', 'PICKUP_CODE', 'STATION'}
        # Output file; annotations are appended, never overwritten.
        self.data_file = "data/sms_ner.txt"

    def validate_label(self, label: str) -> bool:
        """Return True if *label* is a well-formed tag.

        A label is valid when it is exactly ``O``, or when its first two
        characters are a known prefix and the remainder is a known entity
        type (for example ``B-POST``). Matching is case-sensitive.
        """
        if label == 'O':
            return True

        # Shortest possible tagged label is "B-X" / "I-X".
        if len(label) < 3:
            return False

        prefix, entity_type = label[:2], label[2:]
        return prefix in self.valid_prefixes and entity_type in self.valid_entity_types

    def show_help(self) -> None:
        """Print the label formats, entity types and special commands."""
        print("\n=== NER数据标注帮助 ===")
        print("有效的标签格式:")
        print("1. O (表示非实体)")
        print("2. B-实体类型 (实体开始)")
        print("3. I-实体类型 (实体继续)")
        print("\n有效的实体类型:")
        # Sets have no stable order; sort so the help text is reproducible.
        for entity_type in sorted(self.valid_entity_types):
            print(f"- {entity_type}")
        print("\n特殊命令:")
        print("- q:退出程序")
        print("- h:显示帮助")
        print("- u:撤销上一个标注")
        print("- s:跳过当前字符")
        print("- r:重新开始当前句子")
        print("================\n")

    def add_sentence(self) -> bool:
        """Prompt for one sentence, collect a label per character, and save it.

        Special commands at the label prompt (case-insensitive):
        ``h`` shows help, ``u`` undoes the previous label, ``s`` skips the
        current character, ``r`` restarts the current sentence from its
        first character, ``q`` quits without saving the sentence.

        Returns:
            False if the user asked to quit, True otherwise (including when
            an empty sentence was rejected).
        """
        sentence = input("\n请输入要标注的句子(输入'q'退出):").strip()
        if sentence.lower() == 'q':
            return False
        if not sentence:
            print("句子不能为空!")
            return True

        # Invariant: current_labels[k] is the label of sentence[k], so the
        # next character to label is always at index len(current_labels).
        current_labels: List[str] = []
        total = len(sentence)
        while len(current_labels) < total:
            i = len(current_labels)
            char = sentence[i]
            print(f"\n当前句子:{sentence}")
            print(f"当前进度:{'▣' * i}{'□' * (total - i)}")
            print(f"当前字符:{char}")

            label = input(f"请输入'{char}'的标签(h查看帮助):").strip()
            command = label.lower()

            # Special commands.
            if command == 'h':
                self.show_help()
                continue
            if command == 'u' and current_labels:
                # Drop the last label; the loop re-prompts for that character.
                current_labels.pop()
                continue
            if command == 'r':
                # Restart the *current* sentence rather than discarding it.
                current_labels.clear()
                continue
            if command == 's':
                # A skipped character is recorded as non-entity so that the
                # labels stay aligned with the characters when saved.
                current_labels.append('O')
                continue
            if command == 'q':
                return False

            # Anything else must be a valid label ('u' with nothing to undo
            # also lands here and is reported as invalid).
            if not self.validate_label(label):
                print(f"错误:无效的标签格式 '{label}'")
                print("请使用 'h' 查看有效的标签格式")
                continue

            current_labels.append(label)

        self.save_to_file(sentence, current_labels)
        print("\n✓ 句子标注已保存")
        return True

    def save_to_file(self, sentence: str, labels: List[str]) -> None:
        """Append one annotated sentence to ``data_file``.

        Writes one ``<char> <label>`` line per character/label pair, then a
        blank line as the sentence separator. Pairs are formed with ``zip``,
        so any surplus characters or labels are ignored.
        """
        # dirname is '' for a bare file name, and os.makedirs('') raises.
        directory = os.path.dirname(self.data_file)
        if directory:
            os.makedirs(directory, exist_ok=True)

        with open(self.data_file, 'a', encoding='utf-8') as f:
            f.writelines(f"{char} {label}\n" for char, label in zip(sentence, labels))
            f.write("\n")  # blank line separates sentences

    def run(self) -> None:
        """Run the interactive loop until the user quits or input ends."""
        print("=== NER数据录入工具 ===")
        print("输入 'h' 查看帮助信息")
        print("输入 'q' 退出程序")

        while True:
            try:
                if not self.add_sentence():
                    break
            except EOFError:
                # stdin is closed: every further input() would fail the same
                # way, so retrying would spin forever.
                break
            except Exception as e:
                print(f"\n错误:{str(e)}")
                print("请重试或输入 'q' 退出")

        print("\n数据录入已完成!")
|
|
if __name__ == "__main__":
    # Script entry point: build the tool and start the interactive session.
    NERDataEntryTool().run()
|