label.py (forked from stanfordmlgroup/chexpert-labeler)
"""Entry-point script to label radiology reports."""
import pandas as pd
from args import ArgParser
from loader import Loader
from stages import Extractor, Classifier, Aggregator
from constants import *
import os
from glob import glob
from tqdm import tqdm
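
# Example invocation (a sketch: the exact flag names are defined in
# args.ArgParser, which is not shown here, so treat these as assumptions
# based on the attributes used below):
#
#   python label.py --reports_path reports.csv --output_path output/labeled.csv --verbose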

BATCH_SIZE = 5000  # Reports per batch; count() returns the number of full batches.


def count(reports_path):
    """Count reports and return the number of full BATCH_SIZE batches."""
    print("Counting reports.")
    # The input CSV has no header: one report per line, read into column 'laudos'.
    reports = pd.read_csv(reports_path, names=['laudos'])
    num_reports = reports.laudos.count()
    num_batches = num_reports // BATCH_SIZE
    print(f"Found {num_reports} reports: {num_batches} full batch(es) of {BATCH_SIZE}.")
    return num_batches
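
# For example, 12,000 reports give count() == 2, and the batched path in
# label() iterates range(2 + 1) == 3 times, so the final partial batch of
# 2,000 reports is still processed (assuming Loader.load(batch=i) slices
# the tail batch correctly; Loader is not shown here).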


def write(reports, labels, output_path, verbose=False):
    """Write labeled reports to the specified path.

    `labels` is expected to be a 2-D array of shape
    (len(reports), len(CATEGORIES)), as produced by Aggregator.aggregate.
    """
    labeled_reports = pd.DataFrame({REPORTS: reports})
    for index, category in enumerate(CATEGORIES):
        labeled_reports[category] = labels[:, index]
    if verbose:
        print(f"Writing reports and labels to {output_path}.")
    labeled_reports[[REPORTS] + CATEGORIES].to_csv(output_path, index=False)


def label(args):
    """Label the provided report(s)."""
    loader = Loader(args.reports_path, args.extract_impression)
    extractor = Extractor(args.mention_phrases_dir,
                          args.unmention_phrases_dir,
                          verbose=args.verbose)
    classifier = Classifier(args.pre_negation_uncertainty_path,
                            args.negation_path,
                            args.post_negation_uncertainty_path,
                            verbose=args.verbose)
    aggregator = Aggregator(CATEGORIES,
                            verbose=args.verbose)

    n_batches = count(args.reports_path)
    print(f"Number of batches: {n_batches}.")

    if n_batches == 0:
        # Load reports in place.
        loader.load()
        # Extract observation mentions in place.
        extractor.extract(loader.collection)
        # Classify mentions in place.
        classifier.classify(loader.collection)
        # Aggregate mentions to obtain one set of labels for each report.
        labels = aggregator.aggregate(loader.collection)
        # Write CSV.
        write(loader.reports, labels, args.output_path, args.verbose)
    else:
        output_dirname = os.path.dirname(args.output_path)
        output_filename = os.path.basename(args.output_path)
        lista_output_batch_path = []
        for i in range(n_batches + 1):
            print(f"Batch {i}.")
            # Build the output path for this batch.
            output_batch_path = os.path.join(output_dirname,
                                             f"{i}_batch_{output_filename}")
            lista_output_batch_path.append(output_batch_path)
            # Skip the batch if its output file already exists, so an
            # interrupted run can resume without relabeling finished batches.
            if os.path.exists(output_batch_path):
                if args.verbose:
                    print(f"Found existing output for batch {i}: "
                          f"{output_batch_path}\nSkipping to the next batch.")
                continue
            # Load reports in place.
            loader.load(batch=i)
            # Extract observation mentions in place.
            extractor.extract(loader.collection)
            # Classify mentions in place.
            classifier.classify(loader.collection)
            # Aggregate mentions to obtain one set of labels for each report.
            labels = aggregator.aggregate(loader.collection)
            # Write CSV for this batch.
            write(loader.reports, labels, output_batch_path, args.verbose)
        # Read all intermediate batch CSVs and concatenate them into the
        # final output. DataFrame.append was removed in pandas 2.0, so
        # pd.concat is used instead.
        if args.verbose:
            print("Aggregating all batches...")
            lista_output_batch_path = tqdm(lista_output_batch_path)
        df_fim = pd.concat(
            [pd.read_csv(path) for path in lista_output_batch_path],
            ignore_index=True)
        df_fim.to_csv(args.output_path, index=False)


if __name__ == "__main__":
    parser = ArgParser()
    label(parser.parse_args())
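
# A sketch of the expected input, inferred from count() above: pd.read_csv
# with names=['laudos'] implies a headerless, single-column CSV with one
# report per line. For example, reports.csv:
#
#   "Heart size is normal. No focal consolidation or pleural effusion."
#   "Stable cardiomegaly. Small left pleural effusion."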