forked from microsoft/knowledge-extraction-recipes-forms
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_form_variation.py
65 lines (46 loc) · 2.31 KB
/
get_form_variation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/python
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import pandas as pd # type:ignore
from dotenv import load_dotenv
from common.common import build_deviations_file, build_max_differences_file, download
# Must run before the Config class below is defined: Config reads os.environ
# while its class body is evaluated, so the .env values have to be loaded first.
load_dotenv()
class Config:
    """
    Settings holder; every attribute is looked up in the process environment
    (populated from the .env file) once, when the class body is evaluated.
    A variable that is not set yields None.
    """
    # The fields we want to find
    ANCHOR_KEYS = os.getenv("ANCHOR_KEYS")
    # Path to your GT file
    GROUND_TRUTH_FILE_PATH = os.getenv("GROUND_TRUTH_FILE_PATH")
    # The generated BBOX file
    BBOX_FILE = os.getenv("BBOX_FILE")
    # The generated deviations file
    DEVIATIONS_FILE = os.getenv("DEVIATIONS_FILE")
    # Output name
    OUTPUT_MAX_DIFF_FILE = os.getenv("OUTPUT_MAX_DIFF_FILE")
    # Output path
    OUTPUT_MAX_DIFF_PATH = os.getenv("OUTPUT_MAX_DIFF_PATH")
    # This is the threshold defined from the std deviation
    THRESHOLD = os.getenv("THRESHOLD")
def main(argv=None):
    """
    Build the per-anchor-key deviations files and the max differences file.

    All input and output locations come from the Config class, which reads
    them from the environment (.env file).

    :param argv: Unused. Optional so that both ``main()`` and ``main(argv)``
                 are valid calls.
    :return: None. Writes one deviations CSV per anchor key (named after
             Config.DEVIATIONS_FILE with ``_<anchor_key>.csv`` appended to its
             stem), writes Config.OUTPUT_MAX_DIFF_FILE, and makes sure the
             ``max_diffs_bbox_line`` folder exists under
             Config.OUTPUT_MAX_DIFF_PATH.
    """
    # Let's load the files
    dfclusterbbox = pd.read_csv(Config.BBOX_FILE)
    dfclusterdev = pd.read_csv(Config.DEVIATIONS_FILE)
    anchor_keys = Config.ANCHOR_KEYS.split(",")

    # Drop the extension properly instead of assuming it is 4 characters long
    deviations_stem = os.path.splitext(Config.DEVIATIONS_FILE)[0]

    # Here we build the deviations dataframe
    for anchor_key in anchor_keys:
        # NOTE(review): Config.THRESHOLD is a raw environment string (or None);
        # presumably build_deviations_file converts it - confirm in common.common
        dfdev = build_deviations_file(dfclusterdev, Config.THRESHOLD, anchor_key)
        deviations_path = deviations_stem + '_' + anchor_key + '.csv'
        dfdev.to_csv(deviations_path, sep=',')
        # Report the path that was actually written; Config defines no
        # OUTPUT_DEVIATIONS_FILE attribute, so referencing it raised AttributeError
        print('Wrote mean file {}'.format(deviations_path))

    # Here we build the maximum differences datafile
    dfdfclusterbbox_maxmin = build_max_differences_file(dfclusterbbox, anchor_keys)
    # Sorting by descending bbox_diff before dropping duplicates keeps, for each
    # bbox_max_file, the row with the largest difference
    dfdfclusterbbox_maxmin = (
        dfdfclusterbbox_maxmin.sort_values('bbox_diff', ascending=False)
        .drop_duplicates(subset='bbox_max_file')
        .reset_index()
    )
    dfdfclusterbbox_maxmin.to_csv(Config.OUTPUT_MAX_DIFF_FILE, sep=',')
    print(f'Wrote max differences file {Config.OUTPUT_MAX_DIFF_FILE}')

    box = 'bbox_line'
    # exist_ok avoids the check-then-create race and also creates missing parents
    os.makedirs(os.path.join(Config.OUTPUT_MAX_DIFF_PATH, 'max_diffs_' + box), exist_ok=True)
    # TODO now inspect the report files
# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    import sys

    # main takes an argv parameter; calling it with no argument raised
    # TypeError, so hand over the command-line arguments explicitly.
    main(sys.argv[1:])