Skip to content

Commit

Permalink
LTR feature rename + docs (#1496)
Browse files Browse the repository at this point in the history
+ feature description list
+ rename features for consistency
  • Loading branch information
stephaniewhoo committed Apr 4, 2021
1 parent f110d36 commit a2cfe15
Show file tree
Hide file tree
Showing 72 changed files with 414 additions and 436 deletions.
87 changes: 87 additions & 0 deletions docs/ltr-features.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# LTR Features
|Feature name |
|-------------------------------------------------|
|[IBM Model1](../src/main/java/io/anserini/ltr/feature/IbmModel1.java)
|[Sum of BM25](../src/main/java/io/anserini/ltr/feature/BM25Stat.java)
|Average of BM25
|Median of BM25
|Max of BM25
|Min of BM25
|MaxMinRatio of BM25
|[Sum of LMDir](../src/main/java/io/anserini/ltr/feature/LmDirStat.java)
|Average of LMDir
|Median of LMDir
|Max of LMDir
|Min of LMDir
|MaxMinRatio of LMDir
| [Sum of DFR\_GL2](../src/main/java/io/anserini/ltr/feature/DfrGl2Stat.java)
| Average of DFR\_GL2
| Median of DFR\_GL2
| Max of DFR\_GL2
| Min of DFR\_GL2
| MaxMinRatio of DFR\_GL2
| [Sum of DFR\_in\_expB2](../src/main/java/io/anserini/ltr/feature/DfrInExpB2Stat.java)
| Average of DFR\_in\_expB2
| Median of DFR\_in\_expB2
| Max of DFR\_in\_expB2
| Min of DFR\_in\_expB2
| MaxMinRatio of DFR\_in\_expB2
| [Sum of DPH](../src/main/java/io/anserini/ltr/feature/DphStat.java)
| Average of DPH
| Median of DPH
| Max of DPH
| Min of DPH
| MaxMinRatio of DPH
| [Sum of TF](../src/main/java/io/anserini/ltr/feature/TfStat.java)
| Average of TF
| Median of TF
| Max of TF
| Min of TF
| MaxMinRatio of TF
| [Sum of TFIDF](../src/main/java/io/anserini/ltr/feature/TfIdfStat.java)
| Average of TFIDF
| Median of TFIDF
| Max of TFIDF
| Min of TFIDF
| MaxMinRatio of TFIDF
| [Sum of Normalized TF](../src/main/java/io/anserini/ltr/feature/NormalizedTfStat.java)
| Average of Normalized TF
| Median of Normalized TF
| Max of Normalized TF
| Min of Normalized TF
| MaxMinRatio of Normalized TF
| [Sum of IDF](../src/main/java/io/anserini/ltr/feature/IdfStat.java)
| Average of IDF
| Median of IDF
| Max of IDF
| Min of IDF
| MaxMinRatio of IDF
| [Sum of ICTF](../src/main/java/io/anserini/ltr/feature/IcTfStat.java)
| Average of ICTF
| Median of ICTF
| Max of ICTF
| Min of ICTF
| MaxMinRatio of ICTF
| [UnorderedSequentialPairs with gap 3](../src/main/java/io/anserini/ltr/feature/UnorderedSequentialPairs.java)
| UnorderedSequentialPairs with gap 8
| UnorderedSequentialPairs with gap 15
| [OrderedSequentialPairs with gap 3](../src/main/java/io/anserini/ltr/feature/OrderedSequentialPairs.java)
| OrderedSequentialPairs with gap 8
| OrderedSequentialPairs with gap 15
| [UnorderedQueryPairs with gap 3](../src/main/java/io/anserini/ltr/feature/UnorderedQueryPairs.java)
| UnorderedQueryPairs with gap 8
| UnorderedQueryPairs with gap 15
| [OrderedQueryPairs with gap 3](../src/main/java/io/anserini/ltr/feature/OrderedQueryPairs.java)
| OrderedQueryPairs with gap 8
| OrderedQueryPairs with gap 15
| [Normalized TFIDF](../src/main/java/io/anserini/ltr/feature/NormalizedTfIdf.java)
| [ProbabilitySum](../src/main/java/io/anserini/ltr/feature/ProbalitySum.java)
| [Proximity](../src/main/java/io/anserini/ltr/feature/Proximity.java)
| [BM25-TP score](../src/main/java/io/anserini/ltr/feature/TpScore.java)
| [TP distance](../src/main/java/io/anserini/ltr/feature/TpDist.java)
| [Doc size](../src/main/java/io/anserini/ltr/feature/DocSize.java)
| [Query Length](../src/main/java/io/anserini/ltr/feature/QueryLength.java)
| [Query Coverage Ratio](../src/main/java/io/anserini/ltr/feature/QueryCoverageRatio.java)
| [Unique Term Count in Query](../src/main/java/io/anserini/ltr/feature/UniqueTermCount.java)
| [Matching Term Count](../src/main/java/io/anserini/ltr/feature/MatchingTermCount.java)
| [SCS](../src/main/java/io/anserini/ltr/feature/SCS.java)
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package io.anserini.ltr.feature;
package io.anserini.ltr;

import java.util.List;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package io.anserini.ltr.feature;
package io.anserini.ltr;

import java.util.List;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package io.anserini.ltr.feature;
package io.anserini.ltr;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package io.anserini.ltr.feature;
package io.anserini.ltr;

import io.anserini.index.IndexArgs;
import io.anserini.index.IndexReaderUtils;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package io.anserini.ltr.feature;
package io.anserini.ltr;

import java.io.FileNotFoundException;
import java.io.IOException;
Expand Down
168 changes: 79 additions & 89 deletions src/main/java/io/anserini/ltr/FeatureExtractorCli.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import com.fasterxml.jackson.databind.ObjectMapper;
import io.anserini.ltr.feature.*;
import io.anserini.ltr.feature.base.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
Expand All @@ -31,7 +30,6 @@
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;

//../indexes/lucene-index-msmarco-passage-doc-expanded-all
public class FeatureExtractorCli {
static class DebugArgs {
@Option(name = "-index", metaVar = "[path]", required = true, usage = "Lucene index directory")
Expand Down Expand Up @@ -91,66 +89,64 @@ public static void addFeature(FeatureExtractorUtils utils, String queryField, St
* utils.add(new Proximity(docField, queryField)); utils.add(new
* TPscore(docField, queryField));
*/
utils.add(new tpDist(docField, queryField));
/*
* utils.add(new DocSize(docField)); if (queryField == "analyzed" && docField ==
* "contents") { utils.add(new QueryLength(queryField)); utils.add(new
* QueryCoverageRatio(docField, queryField)); utils.add(new
* UniqueTermCount(queryField)); }
*
* utils.add(new MatchingTermCount(docField, queryField)); utils.add(new
* SCS(docField, queryField));
*
* utils.add(new tfStat(new AvgPooler(), docField, queryField)); utils.add(new
* tfStat(new MedianPooler(), docField, queryField)); utils.add(new tfStat(new
* SumPooler(), docField, queryField)); utils.add(new tfStat(new MinPooler(),
* docField, queryField)); utils.add(new tfStat(new MaxPooler(), docField,
* queryField)); utils.add(new tfStat(new MaxMinRatioPooler(), docField,
* queryField));
*
* utils.add(new tfIdfStat(true, new AvgPooler(), docField, queryField));
* utils.add(new tfIdfStat(true, new MedianPooler(), docField, queryField));
* utils.add(new tfIdfStat(true, new SumPooler(), docField, queryField));
* utils.add(new tfIdfStat(true, new MinPooler(), docField, queryField));
* utils.add(new tfIdfStat(true, new MaxPooler(), docField, queryField));
* utils.add(new tfIdfStat(true, new MaxMinRatioPooler(), docField,
* queryField));
*
* utils.add(new normalizedTfStat(new AvgPooler(), docField, queryField));
* utils.add(new normalizedTfStat(new MedianPooler(), docField, queryField));
* utils.add(new normalizedTfStat(new SumPooler(), docField, queryField));
* utils.add(new normalizedTfStat(new MinPooler(), docField, queryField));
* utils.add(new normalizedTfStat(new MaxPooler(), docField, queryField));
* utils.add(new normalizedTfStat(new MaxMinRatioPooler(), docField,
* queryField));
*
* utils.add(new idfStat(new AvgPooler(), docField, queryField)); utils.add(new
* idfStat(new MedianPooler(), docField, queryField)); utils.add(new idfStat(new
* SumPooler(), docField, queryField)); utils.add(new idfStat(new MinPooler(),
* docField, queryField)); utils.add(new idfStat(new MaxPooler(), docField,
* queryField)); utils.add(new idfStat(new MaxMinRatioPooler(), docField,
* queryField));
*
* utils.add(new ictfStat(new AvgPooler(), docField, queryField)); utils.add(new
* ictfStat(new MedianPooler(), docField, queryField)); utils.add(new
* ictfStat(new SumPooler(), docField, queryField)); utils.add(new ictfStat(new
* MinPooler(), docField, queryField)); utils.add(new ictfStat(new MaxPooler(),
* docField, queryField)); utils.add(new ictfStat(new MaxMinRatioPooler(),
* docField, queryField));
*
* utils.add(new UnorderedSequentialPairs(3, docField, queryField));
* utils.add(new UnorderedSequentialPairs(8, docField, queryField));
* utils.add(new UnorderedSequentialPairs(15, docField, queryField));
* utils.add(new OrderedSequentialPairs(3, docField, queryField)); utils.add(new
* OrderedSequentialPairs(8, docField, queryField)); utils.add(new
* OrderedSequentialPairs(15, docField, queryField)); utils.add(new
* UnorderedQueryPairs(3, docField, queryField)); utils.add(new
* UnorderedQueryPairs(8, docField, queryField)); utils.add(new
* UnorderedQueryPairs(15, docField, queryField)); utils.add(new
* OrderedQueryPairs(3, docField, queryField)); utils.add(new
* OrderedQueryPairs(8, docField, queryField)); utils.add(new
* OrderedQueryPairs(15, docField, queryField));
*/
utils.add(new TpDist(docField, queryField));

utils.add(new DocSize(docField));
// Query-only features are registered just for the "analyzed" query field paired
// with the "contents" document field.
// Field names are compared by value: == on String tests reference identity and
// only succeeds when both operands happen to be the same interned literal.
// Putting the constant on the left also keeps the check safe if a field is null.
if ("analyzed".equals(queryField) && "contents".equals(docField)) {
  utils.add(new QueryLength(queryField));
  utils.add(new QueryCoverageRatio(docField, queryField));
  utils.add(new UniqueTermCount(queryField));
}

utils.add(new MatchingTermCount(docField, queryField));
utils.add(new SCS(docField, queryField));

utils.add(new TfStat(new AvgPooler(), docField, queryField));
utils.add(new TfStat(new MedianPooler(), docField, queryField));
utils.add(new TfStat(new SumPooler(), docField, queryField));
utils.add(new TfStat(new MinPooler(), docField, queryField));
utils.add(new TfStat(new MaxPooler(), docField, queryField));
utils.add(new TfStat(new MaxMinRatioPooler(), docField, queryField));

utils.add(new TfIdfStat(true, new AvgPooler(), docField, queryField));
utils.add(new TfIdfStat(true, new MedianPooler(), docField, queryField));
utils.add(new TfIdfStat(true, new SumPooler(), docField, queryField));
utils.add(new TfIdfStat(true, new MinPooler(), docField, queryField));
utils.add(new TfIdfStat(true, new MaxPooler(), docField, queryField));
utils.add(new TfIdfStat(true, new MaxMinRatioPooler(), docField, queryField));

utils.add(new NormalizedTfStat(new AvgPooler(), docField, queryField));
utils.add(new NormalizedTfStat(new MedianPooler(), docField, queryField));
utils.add(new NormalizedTfStat(new SumPooler(), docField, queryField));
utils.add(new NormalizedTfStat(new MinPooler(), docField, queryField));
utils.add(new NormalizedTfStat(new MaxPooler(), docField, queryField));
utils.add(new NormalizedTfStat(new MaxMinRatioPooler(), docField, queryField));

utils.add(new IdfStat(new AvgPooler(), docField, queryField));
utils.add(new IdfStat(new MedianPooler(), docField, queryField));
utils.add(new IdfStat(new SumPooler(), docField, queryField));
utils.add(new IdfStat(new MinPooler(), docField, queryField));
utils.add(new IdfStat(new MaxPooler(), docField, queryField));
utils.add(new IdfStat(new MaxMinRatioPooler(), docField, queryField));

utils.add(new IcTfStat(new AvgPooler(), docField, queryField));
utils.add(new IcTfStat(new MedianPooler(), docField, queryField));
utils.add(new IcTfStat(new SumPooler(), docField, queryField));
utils.add(new IcTfStat(new MinPooler(), docField, queryField));
utils.add(new IcTfStat(new MaxPooler(), docField, queryField));
utils.add(new IcTfStat(new MaxMinRatioPooler(), docField, queryField));

utils.add(new UnorderedSequentialPairs(3, docField, queryField));
utils.add(new UnorderedSequentialPairs(8, docField, queryField));
utils.add(new UnorderedSequentialPairs(15, docField, queryField));
utils.add(new OrderedSequentialPairs(3, docField, queryField));
utils.add(new OrderedSequentialPairs(8, docField, queryField));
utils.add(new OrderedSequentialPairs(15, docField, queryField));
utils.add(new UnorderedQueryPairs(3, docField, queryField));
utils.add(new UnorderedQueryPairs(8, docField, queryField));
utils.add(new UnorderedQueryPairs(15, docField, queryField));
utils.add(new OrderedQueryPairs(3, docField, queryField));
utils.add(new OrderedQueryPairs(8, docField, queryField));
utils.add(new OrderedQueryPairs(15, docField, queryField));

}

Expand All @@ -168,16 +164,16 @@ public static void main(String[] args) throws IOException, ExecutionException, I
}

FeatureExtractorUtils utils = new FeatureExtractorUtils(cmdArgs.indexDir, cmdArgs.threads);
// addFeature(utils, "analyzed", "contents");
// addFeature(utils, "analyzed", "predict");
// addFeature(utils, "text_unlemm", "text_unlemm");
addFeature(utils, "analyzed", "contents");
addFeature(utils, "analyzed", "predict");
addFeature(utils, "text_unlemm", "text_unlemm");
addFeature(utils, "text_bert_tok", "text_bert_tok");

// addFeature(utils,"text","text");
// addFeature(utils,"text_unlemm","text_unlemm");
// addFeature(utils,"text_bert_tok","text_bert_tok");
// System.out.println("Load IBM Models");
// utils.add(new
addFeature(utils,"text","text");
addFeature(utils,"text_unlemm","text_unlemm");
addFeature(utils,"text_bert_tok","text_bert_tok");
//System.out.println("Load IBM Models");
//utils.add(new
// IBMModel1("../FlexNeuART/collections/msmarco_doc/derived_data/giza/title_unlemm",
// "text_unlemm",
// "title_unlemm", "text_unlemm"));
Expand Down Expand Up @@ -218,7 +214,6 @@ public static void main(String[] args) throws IOException, ExecutionException, I
while (qids.size() > 0) {
lastQid = qids.remove(0);
List<debugOutput> outputArray = utils.getDebugResult(lastQid);
// System.out.println(String.format("Qid:%s\tLine:%d",lastQid,offset));
for (debugOutput res : outputArray) {
for (int i = 0; i < names.size(); i++) {
time[i] += res.time.get(i);
Expand All @@ -238,7 +233,6 @@ public static void main(String[] args) throws IOException, ExecutionException, I
while (qids.size() > 0) {
lastQid = qids.remove(0);
List<debugOutput> outputArray = utils.getDebugResult(lastQid);
// System.out.println(String.format("Qid:%s\tLine:%d",lastQid,offset));
for (debugOutput res : outputArray) {
for (int i = 0; i < names.size(); i++) {
time[i] += res.time.get(i);
Expand All @@ -251,24 +245,20 @@ public static void main(String[] args) throws IOException, ExecutionException, I
throw e;
}
}
// long executionEnd = System.nanoTime();
// long sumtime = 0;
// for(int i = 0; i < names.size(); i++){
// sumtime += time[i];
// }
// for(int i = 0; i < names.size(); i++){
// System.out.println(names.get(i)+" takes
// "+String.format("%.2f",time[i]/1000000000.0) + "s, accounts for "+
// String.format("%.2f", time[i]*100.0/sumtime) + "%");
// }
long executionEnd = System.nanoTime();
long sumtime = 0;
for(int i = 0; i < names.size(); i++){
sumtime += time[i];
}
for(int i = 0; i < names.size(); i++){
System.out.println(names.get(i)+" takes "+String.format("%.2f",time[i]/1000000000.0) + "s, accounts for "+
String.format("%.2f", time[i]*100.0/sumtime) + "%");
}
utils.close();
reader.close();
//
// long end = System.nanoTime();
// long overallTime = end - start;
// long overhead = overallTime-(executionEnd - executionStart);
// System.out.println("The program takes
// "+String.format("%.2f",overallTime/1000000000.0) + "s, where the overhead
// takes " + String.format("%.2f",overhead/1000000000.0) +"s");
long end = System.nanoTime();
long overallTime = end - start;
long overhead = overallTime-(executionEnd - executionStart);
System.out.println("The program takes "+String.format("%.2f",overallTime/1000000000.0) + "s, where the overhead takes " + String.format("%.2f",overhead/1000000000.0) +"s");
}
}
6 changes: 0 additions & 6 deletions src/main/java/io/anserini/ltr/FeatureExtractorUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,9 @@
package io.anserini.ltr;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.anserini.index.IndexArgs;
import io.anserini.ltr.feature.*;
import io.anserini.ltr.feature.base.*;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.core.tools.picocli.CommandLine;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package io.anserini.ltr.feature;
package io.anserini.ltr;

import java.util.List;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package io.anserini.ltr.feature;
package io.anserini.ltr;

import java.util.List;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package io.anserini.ltr.feature;
package io.anserini.ltr;

import java.util.Collections;
import java.util.List;
Expand Down
Loading

0 comments on commit a2cfe15

Please sign in to comment.