cwa_processing_animal.py
import json
import re
animal_name = ['the bald eagle', 'the tiger', 'the bear', 'the lion', 'the wolf', 'the crocodile', 'the dinosaur',
'the snake', 'the leopard']
animal_name_1 = ['the cat', 'the dog', 'the mouse', 'the rabbit', 'the squirrel']
animal_relations = ['is', 'is not']
animal_relations_1 = ['likes', 'chases', 'needs', 'visits', 'attacks', 'sees']
# animal_relations_1_1 = {'does not like', 'does not chase', 'does not need', 'does not visit', 'does not eat'}
animal_attributes_1 = ['kind', 'quiet', 'round', 'nice', 'smart']
animal_attributes_2 = ['dull', 'rough', 'lazy', 'slow', 'sleepy']
animal_attributes_3 = ['furry', 'small', 'cute', 'lovely', 'beautiful']
animal_attributes_4 = ['big', 'strong', 'awful', 'fierce', 'heavy']
def split_sentences(text):
    return re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
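# Rough illustration: the lookbehinds keep abbreviations such as "e.g." or "Mr."
# from triggering a split, so for plain declarative text:
#   split_sentences("The cat is kind. The dog is big.")
#   -> ['The cat is kind.', 'The dog is big.']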
animal_names = animal_name + animal_name_1
animal_attributes = animal_attributes_1 + animal_attributes_2 + animal_attributes_3 + animal_attributes_4
animal_relations = animal_relations + animal_relations_1
def sentence_processing(text):
    # Initialise the output dictionary and two empty lists
    facts = {}
    rules_condition = []    # stores the condition part of each rule (the "if" part)
    rules_consequence = []  # stores the conclusion part of each rule (the "then" part)
    # Split the text into a list of sentences
    sentences = split_sentences(text)
    # Process each sentence in turn
    for sentence in sentences:
        # Pattern: "<animal> is <attribute>"
        if any(animal in sentence for animal in animal_names) and " is " in sentence and any(
                attr in sentence for attr in animal_attributes) and " is not " not in sentence and "If" not in sentence:
            parts = sentence.split(" is ")
            animal = parts[0].strip()
            attribute = parts[1].replace(".", "").strip()
            if animal not in facts:
                facts[animal] = []
            facts[animal].append(attribute)
        # Pattern: "If something is <attribute 1> then it is <attribute 2>"
        elif "If" in sentence and "then" in sentence and " is not " not in sentence and " and not " not in sentence and " and " not in sentence:
            parts = sentence.split(" then ")
            condition_parts = parts[0].replace("If ", "").split(" is ")
            if len(condition_parts) == 2:
                condition = condition_parts[1].strip()  # attribute 1
                consequence = parts[1].replace("it is ", "").replace(".", "").strip()  # attribute 2
                # Append the attributes to the corresponding lists
                rules_condition.append(condition)
                rules_consequence.append(consequence)
        # Pattern: "If something is not <attribute 1> then it is <attribute 2>"
        elif "If" in sentence and "then" in sentence and " is not " in sentence and " and not " not in sentence and " and " not in sentence:
            parts = sentence.split(" then ")
            condition = parts[0].replace("If ", "").split(" is not ")[1].split(" and ")[0].strip()  # attribute 1
            consequence = parts[1].replace("it is ", "").replace(".", "").strip()  # attribute 2
            # Append the attributes to the corresponding lists
            rules_condition.append(condition)
            rules_consequence.append(consequence)
        # Pattern: "If something is <attribute 1> and not <attribute 2> then it is <attribute 3>"
        elif "if" in sentence.lower() and "and not" in sentence and "then" in sentence and sum(
                attribute in sentence for attribute in animal_attributes) > 2:
            attributes_in_sentence = [attribute for attribute in animal_attributes if attribute in sentence]
            if len(attributes_in_sentence) == 3:
                rules_condition.extend(attributes_in_sentence[:2])
                rules_consequence.append(attributes_in_sentence[2])
        # Pattern: "If something is <attribute 1> and <attribute 2> then it is <attribute 3>"
        elif "if" in sentence.lower() and "and" in sentence and "then" in sentence and sum(
                attribute in sentence for attribute in animal_attributes) > 2:
            attributes_in_sentence = [attribute for attribute in animal_attributes if attribute in sentence]
            if len(attributes_in_sentence) == 3:
                rules_condition.extend(attributes_in_sentence[:2])
                rules_consequence.append(attributes_in_sentence[2])
        # Pattern: "All <attribute 1> animals are <attribute 2>"
        elif "all" in sentence.lower() and "animals are" in sentence and sum(
                attribute in sentence for attribute in animal_attributes) == 2:
            attributes_in_sentence = [attribute for attribute in animal_attributes if attribute in sentence]
            if len(attributes_in_sentence) == 2:
                rules_condition.append(attributes_in_sentence[0])
                rules_consequence.append(attributes_in_sentence[1])
    return facts, rules_condition, rules_consequence
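# Illustrative sketch (assuming the input uses the lowercase animal phrases listed
# in animal_names, e.g. "the cat"):
#   sentence_processing("the cat is kind. If something is quiet then it is nice.")
#   -> ({'the cat': ['kind']}, ['quiet'], ['nice'])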
def find_unique_elements(list1, list2):
    # Return the elements of list1 that do not appear in list2
    unique_elements = []
    for element in list1:
        if element not in list2:
            unique_elements.append(element)
    return unique_elements
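# For example:
#   find_unique_elements(['kind', 'quiet'], ['quiet', 'nice'])
#   -> ['kind']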
def preprocessing(text):
    facts, rules_condition, rules_consequence = sentence_processing(text)
    sentences = ""
    for name in facts.keys():
        # For each attribute that only ever appears as a rule condition (never as a consequence)
        for word in find_unique_elements(rules_condition, rules_consequence):
            # Generate a negated fact if the attribute is not already stated for this animal
            if word not in facts[name]:
                sentences = sentences + f"{name} is not {word}. "
    return sentences + text
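# Illustrative sketch, continuing the example above: 'quiet' occurs only as a rule
# condition and is not a stated fact for 'the cat', so a closed-world negation is
# prepended to the context:
#   preprocessing("the cat is kind. If something is quiet then it is nice.")
#   -> "the cat is not quiet. the cat is kind. If something is quiet then it is nice."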
if __name__ == "__main__":
    json_files = [
        "PARARULE_plus_step2_Animal_sample.json",
        "PARARULE_plus_step3_Animal_sample.json",
        "PARARULE_plus_step4_Animal_sample.json",
        "PARARULE_plus_step5_Animal_sample.json",
    ]
    # Iterate over each JSON file
    for file_name in json_files:
        # Load the JSON data from file
        with open(file_name, 'r') as file:
            data = json.load(file)
        # Extract the context from each entry, preprocess it, and update the entry
        for entry in data:
            original_context = entry['context']
            preprocessed_context = preprocessing(original_context)
            entry['context'] = preprocessed_context
        # Write the updated data back to the JSON file
        with open(file_name, 'w') as file:
            json.dump(data, file, indent=4)
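# Usage sketch (assumes the four PARARULE_plus sample files listed above sit in the
# working directory and that each entry is a dict with a 'context' string):
#   $ python cwa_processing_animal.py
# Each file is rewritten in place with its augmented contexts.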