-
Notifications
You must be signed in to change notification settings - Fork 2
/
template.py
196 lines (180 loc) · 10.3 KB
/
template.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import click
import csv
import ruamel.yaml
from ruamel.yaml.comments import CommentedMap, CommentedSeq
from pathlib import Path
from _version import __version__
def _parse_responses(variable_name, responses_field):
    """
    Convert one ``Responses`` cell of the mapping file into template response entries.

    The cell holds ``;``-separated responses. A response containing a comma is a coded
    response ``"<integer value>, <name>"``; a response without a comma is a user-input
    field type (``text`` and ``integer`` receive a default value generator).

    :param variable_name: Name of the variable the cell belongs to (used in error messages)
    :type variable_name: str
    :param responses_field: Raw content of the ``Responses`` column
    :type responses_field: str
    :return: One dict per response, in the order they appear in the cell
    :rtype: list
    :raises ValueError: If the value of a coded response is not an integer
    """
    responses = []
    # Split into individual response mappings.
    for raw_response in responses_field.split(";"):
        # Important to only split on the first comma, since the name can also include commas.
        # E.g. "0, No, not of Hispanic or Latino origin" -> ("0", "No, not of Hispanic or Latino origin")
        raw_value, comma, raw_name = raw_response.partition(",")

        if comma:
            # Normal response mapping of [value, name].
            try:
                response_value = int(raw_value)
            except ValueError as err:
                # Re-raise with enough context to locate the faulty cell in the mapping file.
                raise ValueError(
                    f"Variable {variable_name!r}: response value {raw_value.strip()!r} "
                    f"in response {raw_response.strip()!r} is not an integer."
                ) from err
            responses.append({
                "response_name": raw_name.strip(),
                "response_value": response_value,
                "frequency": None,
            })
            continue

        # User-input field, e.g. "text", "integer" -> user can enter any text or integer.
        response_type = raw_value.strip()
        generator = {}
        if response_type == "text":
            # If lorem is specified, a lorem ipsum generator will be used.
            generator["lorem"] = {
                "num_sentences": [1, 2],
                "sentence_length": [1, 10],
            }
            # If valid_inputs is specified, a random choice from the list will be used.
            generator["valid_inputs"] = None
        elif response_type == "integer":
            # If range is specified, a random integer within the range will be used.
            # If valid_inputs is specified, a random integer from the list will be used.
            generator["range"] = [0, 10]
            generator["valid_inputs"] = None
        responses.append({
            "response_name": response_type,
            "response_value_generator": generator,
            "frequency": None,
        })
    return responses


def _annotate_responses(responses):
    """
    Wrap the response dicts of one variable in ruamel comment-aware containers and
    attach the explanatory comments for the value generators.

    :param responses: Response dicts as produced by ``_parse_responses``
    :type responses: list
    :return: The same responses as a ``CommentedSeq`` of ``CommentedMap``
    :rtype: CommentedSeq
    """
    annotated = CommentedSeq([CommentedMap(response) for response in responses])
    for response in annotated:
        if "response_value_generator" not in response:
            continue
        response.yaml_set_comment_before_after_key(
            "response_value_generator",
            "Requires special configuration.",
            indent=6
        )
        generator = CommentedMap(response["response_value_generator"])
        response["response_value_generator"] = generator
        generator.yaml_set_comment_before_after_key(
            "valid_inputs",
            "Randomly chooses a value from the list. Ex: ['a', 'b', 'c'] or [1, 2, 3]",
            indent=8
        )
        if "lorem" in generator:
            generator.yaml_set_comment_before_after_key("lorem", "Generates pseudo-Latin text.", indent=8)
            lorem = CommentedMap(generator["lorem"])
            generator["lorem"] = lorem
            lorem["num_sentences"] = CommentedSeq(lorem["num_sentences"])
            lorem["sentence_length"] = CommentedSeq(lorem["sentence_length"])
            lorem["num_sentences"].yaml_set_comment_before_after_key(0, "Minimum number of sentences", indent=12)
            lorem["num_sentences"].yaml_set_comment_before_after_key(1, "Maximum number of sentences", indent=12)
            lorem["sentence_length"].yaml_set_comment_before_after_key(0, "Minimum number of words per sentence", indent=12)
            lorem["sentence_length"].yaml_set_comment_before_after_key(1, "Maximum number of words per sentence", indent=12)
        if "range" in generator:
            generator.yaml_set_comment_before_after_key("range", "Chooses a random integer in the inclusive range.", indent=8)
            generator["range"] = CommentedSeq(generator["range"])
            generator["range"].yaml_set_comment_before_after_key(0, "Minimum value", indent=10)
            generator["range"].yaml_set_comment_before_after_key(1, "Maximum value", indent=10)
    return annotated


def _build_commented_document(template, mapping_file, variable_count, response_count):
    """
    Turn the plain template dict into a commented ruamel document ready to be dumped.

    Note that the lists under ``template["variables"]`` are replaced in place by their
    comment-aware counterparts.

    :param template: Template dict with the keys ``row_count``, ``output_path`` and ``variables``
    :type template: dict
    :param mapping_file: File path of the source mapping file (only mentioned in the header comment)
    :type mapping_file: str
    :param variable_count: Number of variables in the template
    :type variable_count: int
    :param response_count: Total number of responses over all variables
    :type response_count: int
    :return: The commented document
    :rtype: CommentedMap
    """
    document_comment = f"""\
Template generated using v{__version__}.
Source mapping file: "{mapping_file}".
#
{variable_count} variables and {response_count} possible responses.
#
Don't change any `response_name` or `response_value` values!
For responses, only `frequency` (and `response_value_generator` for text/integers) should be modified.
#
Notes:
- All responses under a variable with {{frequency: null}} will have the remaining frequency distribution divided evenly between them.
For example, a variable "nih_example" has four responses: foo{{frequency: 0.4}}, bar{{frequency: 0.3}}, egg{{frequency: null}}, spam{{frequency: null}}
At generation time, egg and spam will have the remaining 0.3 frequency divided evenly between them: egg{{frequency: 0.15}}, spam{{frequency: 0.15}}
- Special responses: `text`, `integer`.
These responses do not have a preassigned "response_value" and require extra configuration to generate such a value.
integer:
// Choose between (prioritization: range > valid_inputs)
range: [min, max] // generates a random integer in inclusive range [min, max]
valid_inputs: int[] // chooses a random integer in list.
text:
// Choose between (prioritization: lorem > valid_inputs)
lorem: // generates random lorem ipsum text
num_sentences: [min, max] // number of sentences (inclusive range)
sentence_length: [min, max] // word length (inclusive range)
valid_inputs: str[] // chooses a random string in list.
- For additional clarification on the structure of template files, refer to template_schema.json, which is the jsonschema specification
for template files.
"""
    document = CommentedMap(template)
    document.yaml_set_start_comment(document_comment, indent=0)
    document.yaml_set_comment_before_after_key(
        "row_count",
        "How many records of data to generate. If null, `generate.py` will expect [-n ROW_COUNT] argument to be specified.\n"
        "Ex: `row_count: 1000` will generate 1000 records of data when the template is run.",
        indent=0
    )
    document.yaml_set_comment_before_after_key(
        "output_path",
        "File name/path to output the data under (or `null` to auto-generate a name). Can be overriden by [-o OUTPUT_PATH] argument.\n"
        "Ex: `output_path: my_synthetic_cde.csv` will output the synthetic CDE under `my_synthetic_cde.csv` when the template is run.",
        indent=0
    )
    variables = document["variables"]
    # Iterate over a snapshot of the keys since the values are reassigned inside the loop.
    for variable in list(variables):
        variables[variable] = _annotate_responses(variables[variable])
    return document


def generate_template_csv(mapping_file, output_path):
    """
    Generate a template file for CDE generation from a RADx CDE mapping file.

    The mapping file is a comma-separated file with (at least) the columns ``Variable``
    and ``Responses``. The generated template is a commented YAML file. If a file already
    exists under ``output_path``, the user is asked for confirmation before it is overwritten.

    :param mapping_file: File path to mapping file
    :type mapping_file: str
    :param output_path: Output path of the generated template file
    :type output_path: str
    :raises KeyError: If a row lacks the ``Variable`` or ``Responses`` column
    :raises ValueError: If the value of a coded response is not an integer
    """
    template = {
        "row_count": 1000,
        "output_path": None,
        "variables": {}
    }
    # newline="" lets the csv module do its own newline handling, as its documentation requires.
    with open(mapping_file, "r", newline="") as f:
        for row in csv.DictReader(f, delimiter=","):
            variable_name = row["Variable"]
            template["variables"][variable_name] = _parse_responses(variable_name, row["Responses"])

    if Path(output_path).is_file():
        if not click.confirm(f"Template file already exists under \"{output_path}\". You will LOSE ALL DATA under the existing template. Continue anyways?"):
            print("Cancelled template generation.")
            return

    variable_count = len(template["variables"])
    response_count = sum(len(responses) for responses in template["variables"].values())

    # Build the whole document before touching the output file, so that a failure while
    # building it does not leave an existing template truncated.
    commented_yaml = _build_commented_document(template, mapping_file, variable_count, response_count)

    yaml = ruamel.yaml.YAML()
    # Ensure that `None` is dumped as `null`.
    yaml.representer.add_representer(type(None), lambda self, data: self.represent_scalar('tag:yaml.org,2002:null', 'null'))
    # Makes list yaml more readable
    yaml.indent(sequence=4, offset=2)

    with open(output_path, "w", encoding="utf-8") as out_file:
        yaml.dump(commented_yaml, out_file)

    print(
        f"Generated new template file under \"{output_path}\" with {variable_count} variables "
        f"and {response_count} possible responses."
    )
if __name__ == "__main__":
    # Command-line entry point: read the mapping file and output path from the
    # arguments and generate the template file.
    import argparse

    cli = argparse.ArgumentParser(
        description="Create the base template file for CDE generation from a RADx CDE mapping file"
    )
    # (flags, keyword settings) for every supported command-line option.
    option_specs = (
        (
            ("-m", "--mapping_file"),
            {
                "help": "RADx CDE mapping file, e.g. the RADx global cookbook",
                "action": "store",
                "required": True,
            },
        ),
        (
            ("-o", "--output_path"),
            {
                "help": "Output path of template file",
                "action": "store",
                "default": "cde_template.yaml",
            },
        ),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    options = cli.parse_args()
    generate_template_csv(options.mapping_file, options.output_path)