-
Notifications
You must be signed in to change notification settings - Fork 0
/
bordafuse.py
93 lines (68 loc) · 3.6 KB
/
bordafuse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import argparse
from reader_file import reader
from math import sqrt
# Command-line interface: two positional arguments, the collection file to
# read and the output file to write.
aparser = argparse.ArgumentParser(description="Process filenames")
for positional in ("collection_file", "output_file"):
    aparser.add_argument(positional, nargs=1)
args = aparser.parse_args()

# nargs=1 makes argparse store each value as a one-element list; unpack it.
(collection_file,) = args.collection_file
(output_file,) = args.output_file

# Parsed collection; the rest of the script indexes it by query id.
dictionary = reader(collection_file)
def get_stats_file(filename):
    """Accumulate a per-ranker quality statistic from one collection file.

    The file is parsed with ``reader``; the result is used as a mapping from
    query id to an iterable of ``(_, rel_label, ranks)`` triples, where
    ``ranks`` maps a ranker id to the rank that ranker gave the document
    (``-1`` is handled as a special value, presumably "not retrieved" --
    TODO confirm against ``reader``).

    Only documents with ``rel_label > 0`` contribute.  For each such document
    and each ranker:

    * rank != -1: the statistic grows by ``rel_label * rank``;
    * rank == -1: ``0.1 * rank`` is subtracted, i.e. the statistic grows
      by 0.1.

    Returns a dict keyed by ranker id 1..25 holding the accumulated values.

    Alternatives tried by the original author, with the scores they recorded
    (metric not stated): adding ``rel_label`` alone -> 0.4855;
    ``5 * rel_label`` when rank < 500 else ``rel_label`` -> 4852;
    ``rel_label / rank`` -> 4835; ``rel_label * rank`` -> 4862.
    """
    per_query = reader(filename)
    ranker_stats = {ranker: 0 for ranker in range(1, 26)}

    for qid in per_query.keys():
        for _, relevance, ranks in per_query[qid]:
            # Documents without a positive relevance label are ignored.
            if not relevance > 0:
                continue
            for ranker in ranks.keys():
                position = ranks[ranker]
                if position == -1:
                    # position is -1 here, so this nets out to +0.1.
                    ranker_stats[ranker] -= 0.1 * position
                else:
                    ranker_stats[ranker] += relevance * position

    return ranker_stats
def get_weights(filenames):
    """Compute one normalized weight per ranker from a set of collection files.

    The per-ranker statistics returned by ``get_stats_file`` are summed over
    every file in ``filenames`` and then divided by their grand total, so the
    returned weights sum to 1 whenever that total is non-zero.

    Args:
        filenames: iterable of paths, each passed to ``get_stats_file``.

    Returns:
        dict mapping each ranker id in 1..25 to its weight.  If the grand
        total is zero (for example when ``filenames`` is empty) every weight
        is 0.0 rather than raising ZeroDivisionError.
    """
    ranker_ids = range(1, 26)

    rel_docs_total = dict.fromkeys(ranker_ids, 0)
    for file in filenames:
        local_stats = get_stats_file(file)
        for j in ranker_ids:
            rel_docs_total[j] += local_stats[j]

    # Sum the accumulated *values*.  Calling sum() on the dict itself iterates
    # its keys, which always gives 1 + 2 + ... + 25 = 325 regardless of data.
    total_rel_documents = sum(rel_docs_total.values())
    if total_rel_documents == 0:
        return dict.fromkeys(ranker_ids, 0.0)

    return {rs: rel_docs_total[rs] / total_rel_documents for rs in ranker_ids}
# Files whose statistics feed the ranker weights.  Only the collection being
# fused is used.  The MQ2008-agg Fold1..Fold5 files (test.txt, train.txt and
# vali.txt under MQ2008-agg/FoldN/) are disabled; the original note on them
# reads "Add no valuable contribution".
filenames = [collection_file]

# Per-ranker weights, computed once and shared by every query.
global_weights = get_weights(filenames)

# Query ids in ascending order, so the output file is written deterministically.
sorted_keys = list(dictionary.keys())
sorted_keys.sort()
def weightedborda(listOfDocs):
    """Score and order the documents of one query by weighted rank fusion.

    ``listOfDocs`` is an iterable of ``(docid, _, ranks)`` triples, where
    ``ranks`` maps a ranker id to the rank it assigned the document.  Each
    ranker subtracts ``rank * weight`` from the document's score; a rank of
    ``-1`` is penalized as if it were rank 1000.  Scores are therefore never
    positive, and a score closer to zero is better.

    NOTE: this function reads the module-level ``global_weights`` and writes
    into the module-level ``scores`` dict.  The caller must bind ``scores``
    to a fresh dict before each call, otherwise entries from earlier queries
    remain in the result.

    Returns a list of ``(docid, score)`` pairs, best score first.
    """
    for docid, _, ranks in listOfDocs:
        total = 0
        for ranker, position in ranks.items():
            # A rank of -1 gets a fixed penalty rank of 1000.
            effective_rank = 1000 if position == -1 else position
            total -= effective_rank * global_weights[ranker]
        scores[docid] = total

    ordered = list(scores.items())
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    return ordered
# Write one line per (query, document) in the form
#   <qid> Q0 <docid> <rank> <score> gsp1
# with documents ranked from 1 within each query.
with open(output_file, 'w') as wf:
    for qid in sorted_keys:
        # weightedborda fills this module-level dict; reset it per query so
        # documents from earlier queries do not leak into this ranking.
        scores = {}
        ranking = weightedborda(dictionary[qid])
        for position, (doc, score) in enumerate(ranking, start=1):
            fields = (str(qid), "Q0", doc, str(position), str(score), "gsp1")
            wf.write(" ".join(fields) + "\n")

print("All done!")