-
Notifications
You must be signed in to change notification settings - Fork 78
/
text_process.py
52 lines (47 loc) · 1.87 KB
/
text_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import spacy
from tqdm import tqdm
import codecs as cs
from os.path import join as pjoin
# Load the spaCy 'en_core_web_sm' pipeline once at import time; process_text()
# reuses this single instance to tokenise, POS-tag and lemmatise every caption.
nlp = spacy.load('en_core_web_sm')
def process_text(sentence):
    """Split a caption into words and their part-of-speech tags.

    All hyphens are deleted from ``sentence`` before it is parsed with the
    module-level spaCy pipeline ``nlp``. Tokens whose text is not purely
    alphabetic are dropped. A kept token tagged NOUN or VERB is stored as its
    lemma, unless its text is exactly 'left', which is stored verbatim; every
    other kept token is stored as written.

    Returns:
        A ``(word_list, pos_list)`` tuple of two equal-length lists: the kept
        words and the ``token.pos_`` tag of each.
    """
    words = []
    tags = []
    for tok in nlp(sentence.replace('-', '')):
        text = tok.text
        if text.isalpha():
            # 'left' is exempt from lemmatisation even when tagged NOUN/VERB.
            use_lemma = tok.pos_ in ('NOUN', 'VERB') and text != 'left'
            words.append(tok.lemma_ if use_lemma else text)
            tags.append(tok.pos_)
    return words, tags
def process_humanml3d(corpus):
    """Append one tagged caption line per row of ``corpus`` to its text file.

    Each row's caption is tokenised with ``process_text`` and a line of the
    form ``caption#word/POS word/POS ...#from#to`` is appended to a file under
    ``./dataset/pose_data_raw/texts``. The file name is the row's
    ``new_joint_name`` with its ``.npy`` extension replaced by ``.txt``.

    Args:
        corpus: Table supporting ``len()`` and positional ``.iloc`` access
            whose rows expose the 'caption', 'from', 'to' and
            'new_joint_name' fields (presumably a pandas DataFrame).

    Returns:
        None. Files are opened in append mode, so running this twice on the
        same rows writes each line twice.
    """
    import os

    text_save_path = './dataset/pose_data_raw/texts'
    # Create the output directory up front so a first run does not fail with
    # FileNotFoundError when opening the first text file.
    os.makedirs(text_save_path, exist_ok=True)

    for idx in tqdm(range(len(corpus))):
        # Materialise the row once instead of once per field.
        row = corpus.iloc[idx]
        caption = row['caption']
        name = row['new_joint_name']

        word_list, pos_list = process_text(caption)
        tokens = ' '.join('%s/%s' % pair for pair in zip(word_list, pos_list))

        # Swap only a trailing '.npy' extension: a plain str.replace would
        # also rewrite 'npy' appearing elsewhere in the name.
        if name.endswith('.npy'):
            txt_name = name[:-len('.npy')] + '.txt'
        else:
            # Unexpected name without the extension: keep the legacy mapping.
            txt_name = name.replace('npy', 'txt')

        with cs.open(pjoin(text_save_path, txt_name), 'a+') as f:
            f.write('%s#%s#%s#%s\n' % (caption, tokens, row['from'], row['to']))
def process_kitml(corpus):
    """Append one tagged caption line per row of ``corpus`` to its text file.

    Each row's description is tokenised with ``process_text`` and a line of
    the form ``caption#word/POS word/POS ...#0.0#0.0`` is appended to
    ``<data_id>.txt`` under ``./dataset/kit_mocap_dataset/texts``.

    Args:
        corpus: Table supporting ``len()`` and positional ``.iloc`` access
            whose rows expose the 'desc' and 'data_id' fields (presumably a
            pandas DataFrame). ``data_id`` is concatenated with ``'.txt'``,
            so it must be a string.

    Returns:
        None. Files are opened in append mode, so running this twice on the
        same rows writes each line twice.
    """
    import os

    text_save_path = './dataset/kit_mocap_dataset/texts'
    # Create the output directory up front so a first run does not fail with
    # FileNotFoundError when opening the first text file.
    os.makedirs(text_save_path, exist_ok=True)

    # No per-row timing is read here; every line is written with 0.0 for
    # both the start and end fields.
    start = 0.0
    end = 0.0

    for idx in tqdm(range(len(corpus))):
        # Materialise the row once instead of once per field.
        row = corpus.iloc[idx]
        caption = row['desc']
        name = row['data_id']

        word_list, pos_list = process_text(caption)
        tokens = ' '.join('%s/%s' % pair for pair in zip(word_list, pos_list))

        with cs.open(pjoin(text_save_path, name + '.txt'), 'a+') as f:
            f.write('%s#%s#%s#%s\n' % (caption, tokens, start, end))
if __name__ == "__main__":
    # pandas is only needed by this script entry point and was never imported
    # by the module, so the original `pd.read_csv` call raised NameError.
    import pandas as pd

    # NOTE(review): the CSV is read from kit_mocap_dataset, but the rows are
    # passed to process_humanml3d, which reads the 'caption', 'from', 'to' and
    # 'new_joint_name' fields and writes under ./dataset/pose_data_raw/texts.
    # Confirm whether process_kitml (or a different CSV path) was intended.
    corpus = pd.read_csv('./dataset/kit_mocap_dataset/desc_final.csv')
    process_humanml3d(corpus)