fix
cloudroam
6 天以前 e6fed94443177826cf7497a85e9cdcfc7c43ee21
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import re
from typing import List, Set
 
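# Script: interactive command-line tool for entering character-level NER
# labels (BIO scheme); each label is format-checked before it is saved.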
 
class NERDataEntryTool:
    """Interactive command-line tool for entering character-level NER labels.

    The user types a sentence and is then prompted for one BIO label per
    character. Finished sentences are appended to ``self.data_file`` as
    ``"<char> <label>"`` lines, with a blank line after each sentence.
    """

    # Placeholder stored for a skipped character so that the label list stays
    # aligned with the sentence position by position. It cannot collide with a
    # real label because validate_label() rejects the empty string.
    _SKIPPED = ""

    def __init__(self):
        # Allowed label prefixes and entity types.
        # NOTE: 'O' is kept in this set for backward compatibility only;
        # validate_label() accepts the bare 'O' label before looking at it.
        self.valid_prefixes = {'B-', 'I-', 'O'}
        self.valid_entity_types = {'POST', 'COMPANY', 'ADDRESS', 'PICKUP_CODE', 'STATION'}
        self.data_file = "data/sms_ner.txt"

    def validate_label(self, label: str) -> bool:
        """Return True if *label* is ``'O'`` or ``B-<type>`` / ``I-<type>``.

        ``<type>`` must be one of ``self.valid_entity_types``. Matching is
        case-sensitive.
        """
        if label == 'O':
            return True

        if len(label) < 3:  # shortest non-'O' label is a 2-char prefix + 1 char
            return False

        prefix = label[:2]       # 'B-' or 'I-'
        entity_type = label[2:]  # everything after the prefix

        return prefix in self.valid_prefixes and entity_type in self.valid_entity_types

    def show_help(self):
        """Print the label format, the known entity types and the commands."""
        print("\n=== NER数据标注帮助 ===")
        print("有效的标签格式:")
        print("1. O (表示非实体)")
        print("2. B-实体类型 (实体开始)")
        print("3. I-实体类型 (实体继续)")
        print("\n有效的实体类型:")
        # Sorted so the listing is stable between runs (set order is not).
        for entity_type in sorted(self.valid_entity_types):
            print(f"- {entity_type}")
        print("\n特殊命令:")
        print("- q:退出程序")
        print("- h:显示帮助")
        print("- u:撤销上一个标注")
        print("- s:跳过当前字符")
        print("- r:重新开始当前句子")
        print("================\n")

    def add_sentence(self) -> bool:
        """Prompt for one sentence, label it character by character, save it.

        Commands accepted at the label prompt (case-insensitive):
        ``h`` help, ``u`` undo the previous character (label or skip),
        ``s`` skip the current character (it is left out of the saved data),
        ``r`` restart labelling of the same sentence from its first character,
        ``q`` quit without saving the current sentence.

        Returns:
            False if the user asked to quit, True otherwise (including when
            the sentence was empty or nothing was saved).
        """
        sentence = input("\n请输入要标注的句子(输入'q'退出):").strip()
        if sentence.lower() == 'q':
            return False
        if not sentence:
            print("句子不能为空!")
            return True

        # One entry per processed character (a label or _SKIPPED). The current
        # position is always len(labels), so the list cannot drift out of step
        # with the sentence, whatever mix of skip/undo the user enters.
        labels: List[str] = []
        total = len(sentence)
        while len(labels) < total:
            i = len(labels)
            char = sentence[i]
            print(f"\n当前句子:{sentence}")
            print(f"当前进度:{'▣' * i + '□' * (total - i)}")
            print(f"当前字符:{char}")

            label = input(f"请输入'{char}'的标签(h查看帮助):").strip()
            command = label.lower()

            # Special commands
            if command == 'h':
                self.show_help()
                continue
            if command == 'u':
                if labels:
                    labels.pop()  # steps back one character, label or skip
                else:
                    print("没有可撤销的标注")
                continue
            if command == 'r':
                labels.clear()  # same sentence, back to the first character
                continue
            if command == 's':
                labels.append(self._SKIPPED)
                continue
            if command == 'q':
                return False

            # Ordinary label: validate before accepting it
            if not self.validate_label(label):
                print(f"错误:无效的标签格式 '{label}'")
                print("请使用 'h' 查看有效的标签格式")
                continue

            labels.append(label)

        # Drop skipped characters, keeping each remaining label with its char.
        kept = [(ch, lab) for ch, lab in zip(sentence, labels) if lab != self._SKIPPED]
        if not kept:
            print("\n所有字符均被跳过,未保存任何内容")
            return True

        self.save_to_file(
            ''.join(ch for ch, _ in kept),
            [lab for _, lab in kept],
        )
        print("\n✓ 句子标注已保存")
        return True

    def save_to_file(self, sentence: str, labels: List[str]):
        """Append one labelled sentence to ``self.data_file``.

        Writes one ``"<char> <label>"`` line per character followed by a blank
        separator line. Characters and labels are paired positionally; if the
        two differ in length, the extra items of the longer one are ignored.
        The parent directory is created when the path has one.
        """
        directory = os.path.dirname(self.data_file)
        if directory:  # os.makedirs('') raises for a bare file name
            os.makedirs(directory, exist_ok=True)

        with open(self.data_file, 'a', encoding='utf-8') as f:
            f.writelines(f"{char} {label}\n" for char, label in zip(sentence, labels))
            f.write("\n")  # blank line separates sentences

    def run(self):
        """Run the entry loop until the user quits or input is exhausted."""
        print("=== NER数据录入工具 ===")
        print("输入 'h' 查看帮助信息")
        print("输入 'q' 退出程序")

        while True:
            try:
                if not self.add_sentence():
                    break
            except EOFError:
                # stdin is closed or exhausted: every further input() would
                # raise again, so retrying would loop forever.
                print("\n输入已结束")
                break
            except Exception as e:
                print(f"\n错误:{str(e)}")
                print("请重试或输入 'q' 退出")

        print("\n数据录入已完成!")
 
if __name__ == "__main__":
    # Script entry point: build the tool and start the interactive session.
    NERDataEntryTool().run()