# celeba_description_generator.py
from args import Args
from json_manager import JSONManager
import json
from tqdm import tqdm
import os
import re
from random import randint
import itertools

# Parameters
skip_under = 10

from annotation_manager import *


def cleanup_face2text():
    """Collect every description per image from the Face2Text descriptions json
    into a simple {image_name: [descriptions]} mapping and save it as the cleaned-up json."""
    json_manager = JSONManager(Args.descriptions_json_path)
    data = {}
    for i in range(len(json_manager.data)):
        image_name = json_manager.get_imagename_from_idx(i)
        no_descriptions = json_manager.get_number_of_descriptions_at_idx(i)
        for j in range(no_descriptions):
            description = json_manager.get_description_from_idx_with_idx(i, j)
            if image_name in data:
                data[image_name].append(description)
            else:
                data[image_name] = [description]
    with open(Args.cleanedup_lfw_descriptions, 'w') as outfile:
        json.dump(data, outfile)
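
# Quick sanity check of the cleaned-up json (a sketch, assuming the file has already
# been written by cleanup_face2text() above):
#   with open(Args.cleanedup_lfw_descriptions) as f:
#       data = json.load(f)
#   print(next(iter(data.items())))  # -> ("<image_name>", ["description 1", "description 2", ...])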


def generate_new_dataset_json():
    """
    Generate a json file that combines the newly generated descriptions with the old
    descriptions (from the 5685 data).
    """
    filepath = Args.uluc_generated_descriptions
    dataset_5685 = Args.descriptions_json_path
    savepath = Args.uluc_combined_dataset_json
    # old json manager for extracting the old descriptions
    json_manager = JSONManager(dataset_5685)
    data = {}
    for i in range(len(json_manager.data)):
        image_name = json_manager.get_imagename_from_idx(i)
        no_descriptions = json_manager.get_number_of_descriptions_at_idx(i)
        for j in range(no_descriptions):
            description = json_manager.get_description_from_idx_with_idx(i, j)
            if image_name in data:
                data[image_name].append(description)
            else:
                data[image_name] = [description]
    with open(filepath) as fp:
        line = fp.readline()
        while line:
            item = line.split(":")
            image_name = item[0]
            # no image should exist in both the old and the new descriptions
            assert image_name not in data
            description = item[1]
            description = description[1:]  # remove the unnecessary space at the beginning of each description
            data[image_name] = [description]  # each image has only one description in the new set, so no .append() is needed
            line = fp.readline()
    with open(savepath, 'w') as outfile:
        json.dump(data, outfile)
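
# Each line of Args.uluc_generated_descriptions is expected to follow the
# "image_name: description" format written by CelebaDescriptionGenerator.execute() below,
# e.g. (illustrative line, not real data):
#   000042.jpg: the man in the picture has black hair . he has a big nose .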


def get_annotation_dict():
    """Parse the "key: value" lines of the annotation dictionary file into a dict."""
    result = {}
    with open(Args.annotation_dict_path) as fp:
        line = fp.readline()
        while line:
            line = line.split(":")
            line[1] = line[1][0:-1]  # get rid of the "\n" at the end of the string
            result[line[0]] = line[1]
            line = fp.readline()
    return result


def get_annotation_sentences():
    """Parse the "key: value" lines of the annotation sentence-parts file into a dict."""
    result = {}
    with open(Args.annotation_sentence_parts) as fp:
        line = fp.readline()
        while line:
            line = line.split(":")
            line[1] = line[1][0:-1]  # get rid of the "\n" at the end of the string
            result[line[0]] = line[1]
            line = fp.readline()
    print(result)
    return result
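
# Both annotation files parsed above are expected to contain one "key: value" pair per line.
# Illustrative example only; the real keys and values live in Args.annotation_dict_path and
# Args.annotation_sentence_parts:
#   black_hair: black hair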


class CelebaDescriptionGenerator(object):
    """
    Merge descriptions generated from CelebA annotations with the original data.
    If an image already has a description in the original data, it is skipped;
    descriptions are only generated for images without one. The new data is then
    merged with the old data to obtain a bigger dataset.
    """
    lfw_data = None
    annotation_dict = None
    annotation_sentence_parts = None

    def __init__(self, cleanup_data=False):
        self.annotation_dict = get_annotation_dict()
        self.annotation_sentence_parts = get_annotation_sentences()
        if cleanup_data:
            cleanup_face2text()
        with open(Args.cleanedup_lfw_descriptions) as json_file:
            self.lfw_data = json.load(json_file)

    def rule_check(self, annotations):
        """
        Return False if annotations marked as 1 contradict each other
        (such as blond hair and bald both marked as 1).
        """
        # removed the hair check below, since partly bald people are marked as bald too.
        # # check hair (if bald and has hair)
        # if annotations[bald] == "1" and (annotations[black_hair] == "1" or annotations[blond_hair] == "1" or annotations[brown_hair] == "1"
        #         or annotations[gray_hair] == "1" or annotations[straight_hair] == "1" or annotations[wavy_hair] == "1" or annotations[bangs] == "1"):
        #     return False

        # check if woman and has beard (really low chance)
        if (not annotations[male] == "1") and (annotations[goatee] == "1" or annotations[mustache] == "1"):
            return False
        # contradictory: "no beard" (index 24) marked together with a goatee or a mustache
        if annotations[24] == "1" and (annotations[goatee] == "1" or annotations[mustache] == "1"):
            return False
        # skip if there is no information about hair colour at all
        if (not annotations[black_hair] == "1") and (not annotations[blond_hair] == "1") and (not annotations[brown_hair] == "1") and (not annotations[gray_hair] == "1"):
            return False
        return True

    def count_ones(self, annotations):
        """Count how many annotations are marked as '1'."""
        count = 0
        for item in annotations:
            if item == "1":
                count += 1
        return count

    def generate_description(self, annotations):
        """Build a textual description from a single row of annotations."""
        # info from annotations
        intro = get_introduction_info(annotations)
        he_she = get_gender_auxiliary(annotations)
        his_her = get_gender_auxiliary2(annotations)
        hair = get_hair(annotations)
        face = get_face(annotations)
        nose = get_nose(annotations)
        beard = get_beard(annotations)
        eye_area = get_eye_information(annotations, he_she, his_her)
        makeup = get_makeup(annotations)
        mouth_area = get_mouth_area(annotations, he_she)

        # gender and hair description
        combination1 = f"{intro} {hair} ."
        # face information
        combination2 = ""
        if face != "":
            combination2 = f" {he_she} has {face} ."
        # nose
        combination3 = ""
        if nose != "":
            combination3 = f" {he_she} has {nose} ."
        # beard (no need to state that a woman has no beard)
        combination4 = ""
        if beard != "" and he_she != "she":
            combination4 = f" {he_she} has {beard} ."
        # eye area
        combination5 = ""
        if eye_area != "":
            combination5 = f"{eye_area}"
        combination6 = f"{makeup}"
        combination7 = f"{mouth_area}"

        # add the descriptive sentences in a random order (except the introduction sentence)
        middle_choices = [combination2, combination3, combination4, combination5, combination6, combination7]
        middle_choice_combinations = list(itertools.permutations(middle_choices))
        choice = randint(0, len(middle_choice_combinations) - 1)
        description = combination1 + "".join(middle_choice_combinations[choice])

        # collapse extra whitespace and the " . . " runs left behind by empty combinations
        description = re.sub(r'\s+', ' ', description)
        description = re.sub(r'( \. \. )+', ' . ', description)
        description = re.sub(r'( \. \. )+', ' . ', description)  # applied twice to catch longer runs
        return description

    def execute(self, filepath):
        """Read the CelebA annotation file at `filepath` and write the generated descriptions
        to Args.uluc_generated_descriptions, one "image_name: description" line per image."""
        no_male = 0.001
        no_female = 0.001
        desired_ratio = 1.0
        with open(filepath) as fp:
            line = fp.readline()  # we don't need the first line
            annotation_names = fp.readline().split(" ")  # the second line has the annotation names
            annotation_names = annotation_names[0:-1]  # remove "\n" from the array
            line = fp.readline()
            current = 1
            progress_bar = tqdm(total=198523, initial=current)  # hard-coded number of annotation lines to process
            progress_bar.set_description("Generating descriptions from annotations")
            # file operations
            if os.path.exists(Args.uluc_generated_descriptions):
                os.remove(Args.uluc_generated_descriptions)
            f = open(Args.uluc_generated_descriptions, "a+", encoding="utf-8")
            # some debugging variables
            skipped_already_has_desc = 0
            skipped_no_annons = 0
            skipped_rule_check = 0
            skipped_ratio_check = 0
            generated = 0
            skip = False
            while line:
                item = line.split(" ")
                image_name = item[0]
                annotations = item[1:]
                # only generate descriptions for images without any description
                if image_name in self.lfw_data:
                    # TODO: Add existing description for these files
                    skipped_already_has_desc += 1
                    skip = True
                # skip images with too few annotations marked as "1"
                no_ones = self.count_ones(annotations)
                if no_ones < skip_under:
                    skipped_no_annons += 1
                    skip = True
                # skip images whose annotations contradict each other
                if not self.rule_check(annotations):
                    skipped_rule_check += 1
                    skip = True
                # skip non-male images while males are underrepresented (annotations[20] is the "Male" attribute)
                m_to_f_ratio = no_male / no_female
                if m_to_f_ratio < desired_ratio and annotations[20] != "1":
                    skipped_ratio_check += 1
                    skip = True
                if skip:
                    skip = False
                    line = fp.readline()
                    current += 1
                    progress_bar.update(1)
                    continue
                if annotations[20] == "1":
                    no_male += 1
                else:
                    no_female += 1
                description = self.generate_description(annotations)
                generated += 1
                f.write(f"{image_name}: {description}\n")
                line = fp.readline()
                current += 1
                progress_bar.update(1)
            f.close()
            progress_bar.close()
        print(f"Skipped {skipped_already_has_desc} images since they already had descriptions.")
        print(f"Skipped {skipped_no_annons} images since they had fewer than {skip_under} annotations marked as '1'.")
        print(f"Skipped {skipped_rule_check} images since their annotations made no sense (broke the rules).")
        print(f"Skipped {skipped_ratio_check} images to preserve the male to female ratio.")
        print(f"Generated a dataset of descriptions with a male to female ratio of {no_male / no_female}.")
        print(f"Generated {generated} descriptions.")