From e19755b5fa976127830597bc9fbca203b9f5ad24 Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Tue, 10 Nov 2020 07:47:24 -0500 Subject: [PATCH] Refactor Solr/ES scripts + docs (#1403) --- docs/elastirini.md | 71 +++++++------ docs/solrini.md | 95 +++++++++-------- src/main/python/run_es_regression.py | 138 ++++++++++++++----------- src/main/python/run_solr_regression.py | 129 +++++++++++++---------- 4 files changed, 241 insertions(+), 192 deletions(-) diff --git a/docs/elastirini.md b/docs/elastirini.md index b4c821e932..ce11b5242d 100644 --- a/docs/elastirini.md +++ b/docs/elastirini.md @@ -43,7 +43,8 @@ sh target/appassembler/bin/IndexCollection -collection TrecCollection -generator -es -es.index robust04 -threads 16 -input /path/to/disk45 -storePositions -storeDocvectors -storeRaw ``` -We can then run the following command to replicate Anserini BM25 retrieval: +We may need to wait a few minutes after indexing for the index to "catch up" before performing retrieval, otherwise the evaluation metrics may be off. +Run the following command to replicate Anserini BM25 retrieval: ```bash sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index robust04 \ @@ -54,78 +55,78 @@ sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index robust04 \ To evaluate effectiveness: ```bash -$ eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.robust04.txt runs/run.es.robust04.bm25.topics.robust04.txt +$ tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.robust04.txt runs/run.es.robust04.bm25.topics.robust04.txt map all 0.2531 P_30 all 0.3102 ``` -## Indexing and Retrieval: MS MARCO Passage +## Indexing and Retrieval: Core18 -We can replicate the [BM25 Baselines on MS MARCO (Passage)](experiments-msmarco-passage.md) results in a similar way. -First, set up the proper schema using [this config](../src/main/resources/elasticsearch/index-config.msmarco-passage.json): +We can replicate the [TREC Washington Post Corpus](regressions-core18.md) results in a similar way. +First, set up the proper schema using [this config](../src/main/resources/elasticsearch/index-config.core18.json): ```bash -cat src/main/resources/elasticsearch/index-config.msmarco-passage.json \ - | curl --user elastic:changeme -XPUT -H 'Content-Type: application/json' 'localhost:9200/msmarco-passage' -d @- +cat src/main/resources/elasticsearch/index-config.core18.json \ + | curl --user elastic:changeme -XPUT -H 'Content-Type: application/json' 'localhost:9200/core18' -d @- ``` Indexing: ```bash -sh target/appassembler/bin/IndexCollection -collection JsonCollection -generator DefaultLuceneDocumentGenerator \ - -es -es.index msmarco-passage -threads 9 -input /path/to/msmarco-passage -storePositions -storeDocvectors -storeRaw +sh target/appassembler/bin/IndexCollection -collection WashingtonPostCollection -generator WashingtonPostGenerator \ + -es -es.index core18 -threads 8 -input /path/to/WashingtonPost -storePositions -storeDocvectors -storeContents ``` -We may need to wait a few minutes after indexing for the index to catch up before performing retrieval, otherwise wrong evaluation metrics are returned. +We may need to wait a few minutes after indexing for the index to "catch up" before performing retrieval, otherwise the evaluation metrics may be off. 
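One way to tell when the index has "caught up" is to watch the document count stabilize. A minimal sketch (an editorial aside, not part of this patch), assuming the default `localhost:9200` endpoint used throughout these docs:

```python
# Poll Elasticsearch's _count endpoint until the document count stops
# changing between two successive checks. The index name and polling
# interval below are illustrative.
import time

import requests


def wait_for_stable_count(index, interval=60):
    previous = -1
    while True:
        response = requests.get('http://localhost:9200/{}/_count'.format(index))
        response.raise_for_status()
        count = response.json()['count']
        if count == previous:
            return count
        previous = count
        time.sleep(interval)


print(wait_for_stable_count('robust04'))
```

Polling the count is crude, but it matches the failure mode described here: retrieval against a still-indexing collection silently returns depressed metrics.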
Retrieval: ```bash -sh target/appassembler/bin/SearchElastic -topicreader TsvString -es.index msmarco-passage \ - -topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt -output runs/run.es.msmacro-passage.txt +sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index core18 \ + -topics src/main/resources/topics-and-qrels/topics.core18.txt \ + -output runs/run.es.core18.bm25.topics.core18.txt ``` Evaluation: ```bash -$ ./eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 -m map src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.es.msmacro-passage.txt -map all 0.1956 -recall_1000 all 0.8573 +$ tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.es.core18.bm25.topics.core18.txt +map all 0.2495 +P_30 all 0.3567 ``` -## Indexing and Retrieval: Core18 +## Indexing and Retrieval: MS MARCO Passage -We can replicate the [TREC Washington Post Corpus](regressions-core18.md) results in a similar way. -First, set up the proper schema using [this config](../src/main/resources/elasticsearch/index-config.core18.json): +We can replicate the [BM25 Baselines on MS MARCO (Passage)](experiments-msmarco-passage.md) results in a similar way. +First, set up the proper schema using [this config](../src/main/resources/elasticsearch/index-config.msmarco-passage.json): ```bash -cat src/main/resources/elasticsearch/index-config.core18.json \ - | curl --user elastic:changeme -XPUT -H 'Content-Type: application/json' 'localhost:9200/core18' -d @- +cat src/main/resources/elasticsearch/index-config.msmarco-passage.json \ + | curl --user elastic:changeme -XPUT -H 'Content-Type: application/json' 'localhost:9200/msmarco-passage' -d @- ``` Indexing: ```bash -sh target/appassembler/bin/IndexCollection -collection WashingtonPostCollection -generator WashingtonPostGenerator \ - -es -es.index core18 -threads 8 -input /path/to/WashingtonPost -storePositions -storeDocvectors -storeContents +sh target/appassembler/bin/IndexCollection -collection JsonCollection -generator DefaultLuceneDocumentGenerator \ + -es -es.index msmarco-passage -threads 9 -input /path/to/msmarco-passage -storePositions -storeDocvectors -storeRaw ``` -We may need to wait a few minutes after indexing for the index to catch up before performing retrieval, otherwise wrong evaluation metrics are returned. +We may need to wait a few minutes after indexing for the index to "catch up" before performing retrieval, otherwise the evaluation metrics may be off. 
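The schema-setup `curl` at the start of this section can equivalently be issued from Python; this is essentially what `create_index()` in `run_es_regression.py` does later in this patch. A sketch, reusing the placeholder credentials from the `curl` example:

```python
# PUT the index config to create the msmarco-passage index, mirroring the
# curl invocation above (the basic-auth values are the docs' placeholders).
import requests

filename = 'src/main/resources/elasticsearch/index-config.msmarco-passage.json'
with open(filename, mode='r') as file:
    json_config = file.read()

response = requests.put('http://localhost:9200/msmarco-passage',
                        data=json_config,
                        headers={'Content-type': 'application/json'},
                        auth=('elastic', 'changeme'))
response.raise_for_status()
print(response.json())
```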
Retrieval:
 
 ```bash
-sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index core18 \
-  -topics src/main/resources/topics-and-qrels/topics.core18.txt \
-  -output runs/run.es.core18.bm25.topics.core18.txt
+sh target/appassembler/bin/SearchElastic -topicreader TsvString -es.index msmarco-passage \
+ -topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt -output runs/run.es.msmarco-passage.txt
 ```
 
 Evaluation:
 
 ```bash
-$ eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.es.core18.bm25.topics.core18.txt
-map all 0.2495
-P_30 all 0.3567
+$ tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 -m map src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.es.msmarco-passage.txt
+map all 0.1956
+recall_1000 all 0.8573
 ```
 
 ## Indexing and Retrieval: MS MARCO Document
@@ -145,7 +146,7 @@ sh target/appassembler/bin/IndexCollection -collection CleanTrecCollection -gene
   -es -es.index msmarco-doc -threads 1 -input /path/to/msmarco-doc -storePositions -storeDocvectors -storeRaw
 ```
 
-We may need to wait a few minutes after indexing for the index to catch up before performing retrieval, otherwise wrong evaluation metrics are returned.
+We may need to wait a few minutes after indexing for the index to "catch up" before performing retrieval, otherwise the evaluation metrics may be off.
 
 Retrieval:
 
@@ -159,14 +160,14 @@ This can take potentially longer than `SearchCollection` with Lucene indexes.
 Evaluation:
 
 ```bash
-$ ./eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 -m map src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.es.msmacro-doc.txt
+$ tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 -m map src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.es.msmacro-doc.txt
 map all 0.2308
 recall_1000 all 0.8856
 ```
 
 ## Elasticsearch Integration Test
 
-We have an end-to-end integration testing script `run_es_regression.py` for [Core18](regressions-core18.md), [Robust04](regressions-robust04.md), [MS MARCO passage](regressions-msmarco-passage.md) and [MS MARCO document](regressions-msmarco-doc.md). Its functionalities are described below.
+We have an end-to-end integration testing script `run_es_regression.py` for [Robust04](regressions-robust04.md), [Core18](regressions-core18.md), [MS MARCO passage](regressions-msmarco-passage.md), and [MS MARCO document](regressions-msmarco-doc.md):
 
 ```
 # Check if Elasticsearch server is on
 python src/main/python/run_es_regression.py --ping
 
 # Check if collection exists
 python src/main/python/run_es_regression.py --check-index-exists [collection]
 
 # Create index
 python src/main/python/run_es_regression.py --create-index [collection]
 
 # Delete index
 python src/main/python/run_es_regression.py --delete-index [collection]
 
 # Insert documents from input directory into index
 python src/main/python/run_es_regression.py --insert-docs [collection] --input [directory]
 
 # Search and evaluate on collection
 python src/main/python/run_es_regression.py --evaluate [collection]
 ```
 
 To run end-to-end, issue the following command:
 
 ```
 python src/main/python/run_es_regression.py --regression [collection] --input [directory]
 ```
 
+For the `collection` meta-parameter, use `robust04`, `core18`, `msmarco-passage`, or `msmarco-doc`, corresponding to the collections above.
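For reference, the pass/fail check behind `--evaluate` and `--regression` is simple: capture `trec_eval`'s output, read the score from the third tab-separated column of the first line, and compare it against a per-collection expected value with `math.isclose`. Distilled from the script changes later in this patch:

```python
# Distilled from run_es_regression.py / run_solr_regression.py below:
# trec_eval prints one metric per line; the score sits in the third
# tab-separated column, and the scripts compare it with math.isclose.
import math


def map_matches(trec_eval_first_line, expected):
    ap = float(trec_eval_first_line.split('\t')[2])
    return math.isclose(ap, expected)


# For example, the expected MAP for robust04 is 0.2531:
print(map_matches('map\tall\t0.2531', 0.2531))  # True
```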
+
 ## Replication Log
 
 + Results replicated by [@nikhilro](https://github.com/nikhilro) on 2020-01-26 (commit [`d5ee069`](https://github.com/castorini/anserini/commit/d5ee069399e6a306d7685bda756c1f19db721156)) for both [MS MARCO Passage](experiments-msmarco-passage.md) and [Robust04](regressions-robust04.md)
 + Results replicated by [@edwinzhng](https://github.com/edwinzhng) on 2020-01-26 (commit [`7b76dfb`](https://github.com/castorini/anserini/commit/7b76dfbea7e0c01a3a5dc13e74f54852c780ec9b)) for both [MS MARCO Passage](experiments-msmarco-passage.md) and [Robust04](regressions-robust04.md)
-+ Results replicated by [@HangCui0510](https://github.com/HangCui0510) on 2020-04-29 (commit [`07a9b05`](https://github.com/castorini/anserini/commit/07a9b053173637e15be79de4e7fce4d5a93d04fe)) for [MS Marco Passage](regressions-msmarco-passage.md), [Robust04](regressions-robust04.md) and [core18](regressions-core18.md) using end-to-end [`run_es_regression`](../src/main/python/run_es_regression.py)
++ Results replicated by [@HangCui0510](https://github.com/HangCui0510) on 2020-04-29 (commit [`07a9b05`](https://github.com/castorini/anserini/commit/07a9b053173637e15be79de4e7fce4d5a93d04fe)) for [MS Marco Passage](regressions-msmarco-passage.md), [Robust04](regressions-robust04.md) and [Core18](regressions-core18.md) using end-to-end [`run_es_regression`](../src/main/python/run_es_regression.py)
 + Results replicated by [@shaneding](https://github.com/shaneding) on 2020-05-25 (commit [`1de3274`](https://github.com/castorini/anserini/commit/1de3274b057a63382534c5277ffcd772c3fc0d43)) for [MS Marco Passage](regressions-msmarco-passage.md)
 + Results replicated by [@adamyy](https://github.com/adamyy) on 2020-05-29 (commit [`94893f1`](https://github.com/castorini/anserini/commit/94893f170e047d77c3ef5b8b995d7fbdd13f4298)) for [MS MARCO Passage](regressions-msmarco-passage.md), [MS MARCO Document](experiments-msmarco-doc.md)
 + Results replicated by [@YimingDou](https://github.com/YimingDou) on 2020-05-29 (commit [`2947a16`](https://github.com/castorini/anserini/commit/2947a1622efae35637b83e321aba8e6fccd43489)) for [MS MARCO Passage](regressions-msmarco-passage.md)
-+ Results replicated by [@yxzhu16](https://github.com/yxzhu16) on 2020-07-17 (commit [`fad12be`](https://github.com/castorini/anserini/commit/fad12be2e37a075100707c3a674eb67bc0aa57ef)) for [Robust04](regressions-robust04.md), [core18](regressions-core18.md), and [MS MARCO Passage](regressions-msmarco-passage.md)
++ Results replicated by [@yxzhu16](https://github.com/yxzhu16) on 2020-07-17 (commit [`fad12be`](https://github.com/castorini/anserini/commit/fad12be2e37a075100707c3a674eb67bc0aa57ef)) for [Robust04](regressions-robust04.md), [Core18](regressions-core18.md), and [MS MARCO Passage](regressions-msmarco-passage.md)
diff --git a/docs/solrini.md b/docs/solrini.md
index 5f4f64221f..6e26bb075d 100644
--- a/docs/solrini.md
+++ b/docs/solrini.md
@@ -43,14 +43,16 @@ To set the schema, we can make a request to the Schema API:
 curl -X POST -H 'Content-type:application/json' --data-binary @src/main/resources/solr/schemas/SCHEMA_NAME.json http://localhost:8983/solr/COLLECTION_NAME/schema
 ```
 
+For the Robust04 example below, this isn't necessary.
+
 ## Indexing into SolrCloud from Anserini
 
-We can use Anserini as a common "frontend" for indexing into SolrCloud, thus supporting the same range of test collections that's already included in Anserini (when directly building local Lucene indexes).
+We can use Anserini as a common "front-end" for indexing into SolrCloud, thus supporting the same range of test collections that's already included in Anserini (when directly building local Lucene indexes). Indexing into Solr is similar indexing to disk with Lucene, with a few added parameters. Most notably, we replace the `-index` parameter (which specifies the Lucene index path on disk) with Solr parameters. Alternatively, Solr can also be configured to [read prebuilt Lucene index](#solr-with-prebuilt-lucene-index), since Solr uses Lucene indexes under the hood. -We'll index [robust04](regressions-robust04.md) as an example. +We'll index [Robust04](regressions-robust04.md) as an example. First, create the `robust04` collection in Solr: ``` @@ -61,12 +63,12 @@ Run the Solr indexing command for `robust04`: ``` sh target/appassembler/bin/IndexCollection -collection TrecCollection -generator DefaultLuceneDocumentGenerator \ - -threads 8 -input /path/to/robust04 \ + -threads 8 -input /path/to/disk45 \ -solr -solr.index robust04 -solr.zkUrl localhost:9983 \ -storePositions -storeDocvectors -storeRaw ``` -Make sure `/path/to/robust04` is updated with the appropriate path. +Make sure `/path/to/disk45` is updated with the appropriate path for the Robust04 collection. Once indexing has completed, you should be able to query `robust04` from the Solr [query interface](http://localhost:8983/solr/#/robust04/query). @@ -76,102 +78,105 @@ You can also run the following command to replicate Anserini BM25 retrieval: sh target/appassembler/bin/SearchSolr -topicreader Trec \ -solr.index robust04 -solr.zkUrl localhost:9983 \ -topics src/main/resources/topics-and-qrels/topics.robust04.txt \ - -output run.solr.robust04.bm25.topics.robust04.txt + -output runs/run.solr.robust04.bm25.topics.robust04.txt ``` Evaluation can be performed using `trec_eval`: -``` -eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.robust04.txt run.solr.robust04.bm25.topics.robust04.txt +```bash +$ tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 src/main/resources/topics-and-qrels/qrels.robust04.txt runs/run.solr.robust04.bm25.topics.robust04.txt +map all 0.2531 +P_30 all 0.3102 ``` -These instructions can be straightforwardly adapted to work with the [TREC Washington Post Corpus](regressions-core18.md): - -``` -sh target/appassembler/bin/IndexCollection -collection WashingtonPostCollection -generator WapoGenerator \ - -threads 8 -input /path/to/WashingtonPost \ - -solr -solr.index core18 -solr.zkUrl localhost:9983 \ - -storePositions -storeDocvectors -storeContents -``` +Solrini has also been verified to work with following collections as well: -Make sure `core18` collection is created and `/path/to/WashingtonPost` is updated with the appropriate path. ++ [TREC Washington Post Corpus](regressions-core18.md) ++ [MS MARCO Passage Retrieval Corpus](experiments-msmarco-passage.md) ++ [MS MARCO document](regressions-msmarco-doc.md) -Solrini has also been verified to work with the [MS MARCO Passage Retrieval Corpus](experiments-msmarco-passage.md). -There should be no major issues with other collections that are supported by Anserini, but we have not tested them. +See `run_solr_regression.py` regression script for more details. -## Solr with Prebuilt Lucene Index +## Solr with a Pre-built Lucene Index -Solr can be considered a front-end for Lucene, and it is entirely possible for Solr to read prebuilt Lucene indexes. -To achieve this, some housekeeping are required. 
+It is possible for Solr to read pre-built Lucene indexes.
+To achieve this, some housekeeping is required to "install" the pre-built indexes.
 The following uses [Robust04](regressions-robust04.md) as an example.
 
-Assuming your index files are stored under `indexes/robust04/lucene-index.robust04.pos+docvectors+rawdocs/`.
+Let's assume the pre-built index is stored at `indexes/lucene-index.robust04.pos+docvectors+raw/`.
 
 First, a Solr collection must be created to house the index.
-Here we create a collection `robust04` with configset `anserini`.
+Here, we create a collection `robust04` with configset `anserini`.
 
 ```
 solrini/bin/solr create -n anserini -c robust04
 ```
 
 Along with the collection, Solr will create a core instance, whose name can be found in the Solr UI under collection overview.
-It might look something like `<collection>_shard<n>_replica_<x>` (e.g., `robust04_shard1_replica_n1`).
+It'll look something like `<collection>_shard<n>_replica_<x>` (e.g., `robust04_shard1_replica_n1`).
 Solr stores configurations and data for the core instances under Solr home, which for us is `solrini/server/solr/` by default.
 
-Second, make proper Solr schema adjustments if required.
-Here `robust04` is a TREC collection whose schema is already taken care of by [managed-schema](https://github.com/castorini/anserini/blob/master/src/main/resources/solr/anserini/conf/managed-schema) in the Solr configset.
-However, if you are dealing with a collection such as `cord19`, remember to make proper adjustments to the Solr schema, as [previously described](#setting-up-a-single-node-solrcloud-instance).
+Second, make proper Solr schema adjustments if necessary.
+Here, `robust04` is a TREC collection whose schema is already handled by [managed-schema](https://github.com/castorini/anserini/blob/master/src/main/resources/solr/anserini/conf/managed-schema) in the Solr configset.
+However, for a collection such as `cord19`, remember to make proper adjustments to the Solr schema (also see above):
 
 ```
 curl -X POST -H 'Content-type:application/json' --data-binary @src/main/resources/solr/schemas/SCHEMA_NAME.json http://localhost:8983/solr/COLLECTION_NAME/schema
 ```
 
-Then, copy/move the index files to where Solr expected.
-As previously established, Solr stores its index data in a directory called `data` under the core’s instance directory (`solrini/server/solr/<core_name>/data`).
-You can simply copy your Lucene index files to `<core_name>/data/index` and Solr will be able to pick them up from there.
+Finally, we can copy the pre-built index to the location where Solr expects it.
+Start by removing the data that's there:
 
 ```
-cp indexes/robust04/lucene-index.robust04.pos+docvectors+rawdocs/* solrini/server/solr/robust04_shard1_replica_n1/data/index
+rm solrini/server/solr/robust04_shard1_replica_n1/data/index/*
 ```
 
-Lastly, restart Solr to make sure changes are effective.
+Then, simply copy the pre-built Lucene indexes into that location:
+
+```
+cp indexes/lucene-index.robust04.pos+docvectors+raw/* solrini/server/solr/robust04_shard1_replica_n1/data/index
+```
+
+Restart Solr to make sure changes take effect:
 
 ```
 solrini/bin/solr stop
 solrini/bin/solr start -c -m 8G
 ```
 
+You can confirm that everything works by performing a retrieval run and checking the results (see above).
+
 ## Solr integration test
 
 We have an end-to-end integration testing script `run_solr_regression.py`.
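Its `--ping` check is just an HTTP GET against the local Solr port, as implemented in `is_alive()` further down in this patch:

```python
# The script's Solr liveness check: any successful response from the
# Solr root URL counts as alive; connection or HTTP errors do not.
import requests


def solr_is_alive():
    try:
        response = requests.get('http://localhost:8983/')
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return False
    return True


print(solr_is_alive())
```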
-See example usage for [`core18`](regressions-core18.md) below:
+See example usage for [Robust04](regressions-robust04.md) below:
 
 ```bash
 # Check if Solr server is on
 python src/main/python/run_solr_regression.py --ping
 
-# Check if core18 exists
-python src/main/python/run_solr_regression.py --check-index-exists core18
+# Check if robust04 exists
+python src/main/python/run_solr_regression.py --check-index-exists robust04
 
-# Create core18 if it does not exist
-python src/main/python/run_solr_regression.py --create-index core18
+# Create robust04 if it does not exist
+python src/main/python/run_solr_regression.py --create-index robust04
 
-# Delete core18 if it exists
-python src/main/python/run_solr_regression.py --delete-index core18
+# Delete robust04 if it exists
+python src/main/python/run_solr_regression.py --delete-index robust04
 
-# Insert documents from /path/to/WashingtonPost into core18
-python src/main/python/run_solr_regression.py --insert-docs core18 --input /path/to/WashingtonPost
+# Insert documents from /path/to/disk45 into robust04
+python src/main/python/run_solr_regression.py --insert-docs robust04 --input /path/to/disk45
 
-# Search and evaluate on core18
-python src/main/python/run_solr_regression.py --evaluate core18
+# Search and evaluate on robust04
+python src/main/python/run_solr_regression.py --evaluate robust04
 ```
 
 To run end-to-end, issue the following command:
 
 ```bash
-python src/main/python/run_solr_regression.py --regression core18 --input /path/to/WashingtonPost
+python src/main/python/run_solr_regression.py --regression robust04 --input /path/to/disk45
 ```
 
-The regression script has been verified to work for [`robust04`](regressions-robust04.md), [`core18`](regressions-core18.md), and [`msmarco-passage`](experiments-msmarco-passage.md).
+The regression script has been verified to work for [`robust04`](regressions-robust04.md), [`core18`](regressions-core18.md), [`msmarco-passage`](experiments-msmarco-passage.md), and [`msmarco-doc`](regressions-msmarco-doc.md).
 
 ## Replication Log
 
diff --git a/src/main/python/run_es_regression.py b/src/main/python/run_es_regression.py
index 5f6e67d09a..c370251fab 100644
--- a/src/main/python/run_es_regression.py
+++ b/src/main/python/run_es_regression.py
@@ -1,19 +1,18 @@
-# -*- coding: utf-8 -*-
-'''
-Anserini: A Lucene toolkit for replicable information retrieval research
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-'''
+#
+# Anserini: A Lucene toolkit for replicable information retrieval research
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# import argparse import logging @@ -21,6 +20,7 @@ import os import requests import time + import regression_utils # Note that this class is specifically written with REST API requests instead of the @@ -32,16 +32,20 @@ logger.addHandler(ch) logger.setLevel(logging.INFO) + class ElasticsearchClient: def __init__(self): pass - def is_alive(self): + @staticmethod + def is_alive(): try: response = requests.get('http://localhost:9200/') response.raise_for_status() - except: return False - else: return True + except requests.exceptions.RequestException: + return False + else: + return True def does_index_exist(self, collection): # Make sure ES is alive: @@ -49,9 +53,12 @@ def does_index_exist(self, collection): try: response = requests.get('http://localhost:9200/{}'.format(collection)) response.raise_for_status() - except: return False - else: return True - else: raise Exception('ES does not appear to be alive!') + except requests.exceptions.RequestException: + return False + else: + return True + else: + raise Exception('ES does not appear to be alive!') def delete_index(self, collection): logger.info('Deleting index {}...'.format(collection)) @@ -60,9 +67,12 @@ def delete_index(self, collection): try: response = requests.request('DELETE', url='http://localhost:9200/{}'.format(collection)) response.raise_for_status() - except: return False - else: return True - else: raise Exception('The index {} does not exist!'.format(collection)) + except requests.exceptions.RequestException: + return False + else: + return True + else: + raise Exception('The index {} does not exist!'.format(collection)) def create_index(self, collection): logger.info('Creating index {}...'.format(collection)) @@ -74,11 +84,12 @@ def create_index(self, collection): logger.info('Using index config for {} at {}'.format(collection, filename)) with open(filename, mode='r') as file: json = file.read() + response = '' try: response = requests.request('PUT', url='http://localhost:9200/{}'.format(collection), data=json, headers={'Content-type': 'application/json'}) response.raise_for_status() - except: + except requests.exceptions.RequestException: logger.info(response) return False else: @@ -93,7 +104,6 @@ def insert_docs(self, collection, path): if not self.does_index_exist(collection): raise Exception('The index {} does not exist!'.format(collection)) # TODO: abstract this into an external config instead of hard-coded. 
-        command = ''
         if collection == 'robust04':
             command = 'sh target/appassembler/bin/IndexCollection -collection TrecCollection ' + \
                       '-generator DefaultLuceneDocumentGenerator -es -es.index robust04 -threads 16 -input ' + \
                       path + ' -storePositions -storeDocvectors -storeRaw'
@@ -107,9 +117,9 @@ def insert_docs(self, collection, path):
                       '-generator WashingtonPostGenerator -es -es.index core18 -threads 8 -input ' + \
                       path + ' -storePositions -storeDocvectors -storeContents'
         elif collection == 'msmarco-doc':
-            command = 'sh target/appassembler/bin/IndexCollection -collection CleanTrecCollection ' + \
-                '-generator DefaultLuceneDocumentGenerator -es -es.index msmarco-doc -threads 1 -input ' + \
-                path + ' -storePositions -storeDocvectors -storeRaw'
+            command = 'sh target/appassembler/bin/IndexCollection -collection CleanTrecCollection ' + \
+                      '-generator DefaultLuceneDocumentGenerator -es -es.index msmarco-doc -threads 1 -input ' + \
+                      path + ' -storePositions -storeDocvectors -storeRaw'
         else:
             raise Exception('Unknown collection: {}'.format(collection))
         logger.info('Running indexing command: ' + command)
@@ -119,7 +129,6 @@ def evaluate(self, collection):
         if not self.does_index_exist(collection):
             raise Exception('The index {} does not exist!'.format(collection))
         # TODO: abstract this into an external config instead of hard-coded.
-        command = ''
         if collection == 'robust04':
             command = 'sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index robust04 ' + \
                       '-topics src/main/resources/topics-and-qrels/topics.robust04.txt ' + \
                       '-output runs/run.es.robust04.bm25.topics.robust04.txt'
@@ -133,28 +142,30 @@ def evaluate(self, collection):
                       '-topics src/main/resources/topics-and-qrels/topics.core18.txt ' + \
                       '-output runs/run.es.core18.bm25.topics.core18.txt'
         elif collection == 'msmarco-doc':
-            command = 'sh target/appassembler/bin/SearchElastic -topicreader TsvInt -es.index msmarco-doc ' + \
-                '-topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt ' + \
-                '-output runs/run.es.msmarco-doc.txt'
+            command = 'sh target/appassembler/bin/SearchElastic -topicreader TsvInt -es.index msmarco-doc ' + \
+                      '-topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt ' + \
+                      '-output runs/run.es.msmarco-doc.txt'
         else:
             raise Exception('Unknown collection: {}'.format(collection))
         logger.info('Retrieval command: ' + command)
-        output = regression_utils.run_shell_command(command, logger, echo=True)
+        regression_utils.run_shell_command(command, logger, echo=True)
         logger.info('Retrieval complete!')
 
         if collection == 'robust04':
-            command = 'eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \
-                'src/main/resources/topics-and-qrels/qrels.robust04.txt runs/run.es.robust04.bm25.topics.robust04.txt'
+            command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \
+                      'src/main/resources/topics-and-qrels/qrels.robust04.txt ' + \
+                      'runs/run.es.robust04.bm25.topics.robust04.txt'
         elif collection == 'msmarco-passage':
-            command = 'eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \
-                'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.es.msmarco-passage.txt'
+            command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \
+                      'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ' + \
+                      'runs/run.es.msmarco-passage.txt'
         elif collection == 'core18':
-            command = 'eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \
+            command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \
                       'src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.es.core18.bm25.topics.core18.txt'
         elif collection == 'msmarco-doc':
-            command = 'eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \
-                'src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.es.msmarco-doc.txt'
+            command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \
+                      'src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.es.msmarco-doc.txt'
         else:
             raise Exception('Unknown collection: {}'.format(collection))
 
@@ -162,28 +173,37 @@ def evaluate(self, collection):
         output = regression_utils.run_shell_command(command, logger, capture=True)
         ap = float(output[0].split('\t')[2])
 
-        expected = 0
-        if collection == 'robust04': expected = 0.2531
-        elif collection == 'msmarco-passage': expected = 0.1956
-        elif collection == 'core18': expected = 0.2495
-        elif collection == 'msmarco-doc': expected = 0.2308
-        else: raise Exception('Unknown collection: {}'.format(collection))
-
-        if math.isclose(ap, expected): logger.info('[SUCESS] {} MAP verified as expected!'.format(ap))
-        else: logger.info('[FAILED] {} MAP, expected {} MAP!'.format(ap, expected))
+        if collection == 'robust04':
+            expected = 0.2531
+        elif collection == 'msmarco-passage':
+            expected = 0.1956
+        elif collection == 'core18':
+            expected = 0.2495
+        elif collection == 'msmarco-doc':
+            expected = 0.2308
+        else:
+            raise Exception('Unknown collection: {}'.format(collection))
 
+        if math.isclose(ap, expected):
+            logger.info('[SUCCESS] {} MAP verified as expected!'.format(ap))
+        else:
+            logger.info('[FAILED] {} MAP, expected {} MAP!'.format(ap, expected))
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Program for running Elasticsearch regressions.')
-    parser.add_argument('--ping', action='store_true', default=False, help='ping ES and exit')
-    parser.add_argument('--check-index-exists', default='', type=str, metavar='collection', help='check if index exists')
-    parser.add_argument('--delete-index', default='', type=str, metavar='collection', help='deletes index')
-    parser.add_argument('--create-index', default='', type=str, metavar='collection', help='creates index')
-    parser.add_argument('--insert-docs', default='', type=str, metavar='collection', help='insert documents into index')
-    parser.add_argument('--input', default='', type=str, metavar='directory', help='location of documents to insert into index')
-    parser.add_argument('--evaluate', default='', type=str, metavar='collection', help='search and evaluate on collection')
-    parser.add_argument('--regression', default='', type=str, metavar='collection', help='run end-to-end regression')
+    parser.add_argument('--ping', action='store_true', default=False, help='Ping ES and exit.')
+    parser.add_argument('--check-index-exists', default='', type=str, metavar='collection',
+                        help='Check if index exists.')
+    parser.add_argument('--delete-index', default='', type=str, metavar='collection', help='Delete index.')
+    parser.add_argument('--create-index', default='', type=str, metavar='collection', help='Create index.')
+    parser.add_argument('--insert-docs', default='', type=str, metavar='collection',
+                        help='Insert documents into index.')
+    parser.add_argument('--input', default='', type=str, metavar='directory',
+                        help='Location of documents to insert into index.')
+    parser.add_argument('--evaluate', default='', type=str, metavar='collection',
+                        help='Search and evaluate on collection.')
+    parser.add_argument('--regression', default='', type=str, metavar='collection', help='Run end-to-end regression.')
 
     args = parser.parse_args()
 
     es = ElasticsearchClient()
 
diff --git a/src/main/python/run_solr_regression.py b/src/main/python/run_solr_regression.py
index 632879cccf..6b29f43645 100644
--- a/src/main/python/run_solr_regression.py
+++ b/src/main/python/run_solr_regression.py
@@ -1,26 +1,25 @@
-# -*- coding: utf-8 -*-
-'''
-Anserini: A Lucene toolkit for replicable information retrieval research
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-'''
+#
+# Anserini: A Lucene toolkit for replicable information retrieval research
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 
 import argparse
 import logging
 import math
 import os
 import requests
-import time
+
 import regression_utils
 
 logger = logging.getLogger('run_solr_regression')
@@ -29,16 +28,20 @@
 logger.addHandler(ch)
 logger.setLevel(logging.INFO)
 
+
 class SolrClient:
     def __init__(self):
         pass
 
-    def is_alive(self):
+    @staticmethod
+    def is_alive():
         try:
             response = requests.get('http://localhost:8983/')
             response.raise_for_status()
-        except: return False
-        else: return True
+        except requests.exceptions.RequestException:
+            return False
+        else:
+            return True
 
     def does_index_exist(self, collection):
         # Make sure Solr is alive:
@@ -46,19 +49,22 @@ def does_index_exist(self, collection):
         try:
             response = requests.get('http://localhost:8983/solr/admin/collections?action=LIST')
             response.raise_for_status()
-        except: return False
+        except requests.exceptions.RequestException:
+            return False
         else:
             return collection in response.json()['collections']
-        else: raise Exception('Solr does not appear to be alive!')
+        else:
+            raise Exception('Solr does not appear to be alive!')
 
     def delete_index(self, collection):
         # Make sure the index exists:
         if self.does_index_exist(collection):
             command = 'solrini/bin/solr delete -c {}'.format(collection)
             logger.info('Deleting index {} command: {}'.format(collection, command))
-            output = regression_utils.run_shell_command(command, logger, echo=True)
+            regression_utils.run_shell_command(command, logger, echo=True)
             return not self.does_index_exist(collection)
-        else: raise Exception('The index {} does not exist!'.format(collection))
+        else:
+            raise Exception('The index {} does not exist!'.format(collection))
 
     def create_index(self, collection):
         # Make sure the index does not exist:
@@ -67,7 +73,7 @@ def create_index(self, collection):
             self.upload_configs()
             command = 'solrini/bin/solr create -n anserini -c {}'.format(collection)
             logger.info('Creating index {} command: {}'.format(collection, command))
-            output = regression_utils.run_shell_command(command, logger, echo=True)
+            regression_utils.run_shell_command(command, logger, echo=True)
             return self.does_index_exist(collection)
         else:
             raise Exception('The index {} already exists!'.format(collection))
 
@@ -78,43 +84,45 @@ def insert_docs(self, collection, path):
             raise Exception('{} does not exist!'.format(args.input))
         if not self.does_index_exist(collection):
             raise Exception('The index {} does not exist!'.format(collection))
-        command = ''
         if collection == 'core18':
             command = 'sh target/appassembler/bin/IndexCollection -collection WashingtonPostCollection ' + \
                       '-generator WashingtonPostGenerator -solr -solr.index core18 -solr.zkUrl localhost:9983 ' + \
                       '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeContents'
         elif collection == 'robust04':
             command = 'sh target/appassembler/bin/IndexCollection -collection TrecCollection ' + \
-                      '-generator DefaultLuceneDocumentGenerator -solr -solr.index robust04 -solr.zkUrl localhost:9983 ' + \
+                      '-generator DefaultLuceneDocumentGenerator ' + \
+                      '-solr -solr.index robust04 -solr.zkUrl localhost:9983 ' + \
                       '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw'
         elif collection == 'msmarco-passage':
             command = 'sh target/appassembler/bin/IndexCollection -collection JsonCollection ' + \
-                      '-generator DefaultLuceneDocumentGenerator -solr -solr.index msmarco-passage -solr.zkUrl localhost:9983 ' + \
+                      '-generator DefaultLuceneDocumentGenerator ' + \
+                      '-solr -solr.index msmarco-passage -solr.zkUrl localhost:9983 ' + \
                       '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw'
         elif collection == 'msmarco-doc':
             command = 'sh target/appassembler/bin/IndexCollection -collection CleanTrecCollection ' + \
-                      '-generator DefaultLuceneDocumentGenerator -solr -solr.index msmarco-doc -solr.zkUrl localhost:9983 ' + \
+                      '-generator DefaultLuceneDocumentGenerator ' + \
+                      '-solr -solr.index msmarco-doc -solr.zkUrl localhost:9983 ' + \
                       '-threads 8 -input ' + path + ' -storePositions -storeDocvectors -storeRaw'
         else:
             raise Exception('Unknown collection: {}'.format(collection))
         logger.info('Running indexing command: ' + command)
         return regression_utils.run_shell_command(command, logger, echo=True)
 
-    def upload_configs(self):
+    @staticmethod
+    def upload_configs():
         os.chdir('src/main/resources/solr')
         command = 'rm -rf anserini/conf/lang anserini-twitter/conf/lang'
         logger.info('Deleting existing configs command: ' + command)
-        output = regression_utils.run_shell_command(command, logger, echo=True)
+        regression_utils.run_shell_command(command, logger, echo=True)
         command = './solr.sh ../../../../solrini localhost:9983'
         logger.info('Uploading configs command: ' + command)
-        output = regression_utils.run_shell_command(command, logger, echo=True)
+        regression_utils.run_shell_command(command, logger, echo=True)
         os.chdir('../../../..')
         logger.info('Uploading complete!')
 
     def evaluate(self, collection):
         if not self.does_index_exist(collection):
             raise Exception('The index {} does not exist!'.format(collection))
-        command = ''
         if collection == 'core18':
             command = 'sh target/appassembler/bin/SearchSolr -topicreader Trec -solr.index core18 ' + \
                       '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.core18.txt ' + \
                       '-output runs/run.solr.core18.bm25.topics.core18.txt'
@@ -125,17 +133,19 @@ def evaluate(self, collection):
                       '-output runs/run.solr.robust04.bm25.topics.robust04.txt'
         elif collection == 'msmarco-passage':
             command = 'sh target/appassembler/bin/SearchSolr -topicreader TsvString -solr.index msmarco-passage ' + \
-                      '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt ' + \
+                      '-solr.zkUrl localhost:9983 ' + \
+                      '-topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt ' + \
                       '-output runs/run.solr.msmarco-passage.txt'
         elif collection == 'msmarco-doc':
             command = 'sh target/appassembler/bin/SearchSolr -topicreader TsvInt -solr.index msmarco-doc ' + \
-                      '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt ' + \
+                      '-solr.zkUrl localhost:9983 ' + \
+                      '-topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt ' + \
                       '-output runs/run.solr.msmarco-doc.txt '
         else:
             raise Exception('Unknown collection: {}'.format(collection))
 
         logger.info('Retrieval command: ' + command)
-        output = regression_utils.run_shell_command(command, logger, echo=True)
+        regression_utils.run_shell_command(command, logger, echo=True)
         logger.info('Retrieval complete!')
 
         if collection == 'core18':
@@ -143,10 +153,12 @@ def evaluate(self, collection):
             'src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.solr.core18.bm25.topics.core18.txt'
         elif collection == 'robust04':
             command = 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' + \
-                      'src/main/resources/topics-and-qrels/qrels.robust04.txt runs/run.solr.robust04.bm25.topics.robust04.txt'
+                      'src/main/resources/topics-and-qrels/qrels.robust04.txt ' + \
+                      'runs/run.solr.robust04.bm25.topics.robust04.txt'
         elif collection == 'msmarco-passage':
             command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \
-                      'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.solr.msmarco-passage.txt'
+                      'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ' + \
+                      'runs/run.solr.msmarco-passage.txt'
         elif collection == 'msmarco-doc':
             command = 'tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap ' + \
                       'src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.solr.msmarco-doc.txt'
@@ -157,28 +169,37 @@ def evaluate(self, collection):
         output = regression_utils.run_shell_command(command, logger, capture=True)
         ap = float(output[0].split('\t')[2])
 
-        expected = 0
-        if collection == 'core18': expected = 0.2495
-        elif collection == 'robust04': expected = 0.2531
-        elif collection == 'msmarco-passage': expected = 0.1926
-        elif collection == 'msmarco-doc': expected = 0.2310
-        else: raise Exception('Unknown collection: {}'.format(collection))
-
-        if math.isclose(ap, expected): logger.info('[SUCESS] {} MAP verified as expected!'.format(ap))
-        else: logger.info('[FAILED] {} MAP, expected {} MAP!'.format(ap, expected))
+        if collection == 'core18':
+            expected = 0.2495
+        elif collection == 'robust04':
+            expected = 0.2531
+        elif collection == 'msmarco-passage':
+            expected = 0.1926
+        elif collection == 'msmarco-doc':
+            expected = 0.2310
+        else:
+            raise Exception('Unknown collection: {}'.format(collection))
 
+        if math.isclose(ap, expected):
+            logger.info('[SUCCESS] {} MAP verified as expected!'.format(ap))
+        else:
+            logger.info('[FAILED] {} MAP, expected {} MAP!'.format(ap, expected))
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Program for running Solr regressions.')
     parser.add_argument('--ping', action='store_true', default=False, help='ping Solr and exit')
-    parser.add_argument('--check-index-exists', default='', type=str, metavar='collection', help='check if index exists')
-    parser.add_argument('--delete-index', default='', type=str, metavar='collection', help='deletes index')
-    parser.add_argument('--create-index', default='', type=str, metavar='collection', help='creates index')
-    parser.add_argument('--insert-docs', default='', type=str, metavar='collection', help='insert documents into index')
-    parser.add_argument('--input', default='', type=str, metavar='directory', help='location of documents to insert into index')
-    parser.add_argument('--evaluate', default='', type=str, metavar='collection', help='search and evaluate on collection')
-    parser.add_argument('--regression', default='', type=str, metavar='collection', help='run end-to-end regression')
+    parser.add_argument('--check-index-exists', default='', type=str, metavar='collection',
+                        help='Check if index exists.')
+    parser.add_argument('--delete-index', default='', type=str, metavar='collection', help='Delete index.')
+    parser.add_argument('--create-index', default='', type=str, metavar='collection', help='Create index.')
+    parser.add_argument('--insert-docs', default='', type=str, metavar='collection',
+                        help='Insert documents into index.')
+    parser.add_argument('--input', default='', type=str, metavar='directory',
+                        help='Location of documents to insert into index.')
+    parser.add_argument('--evaluate', default='', type=str, metavar='collection',
+                        help='Search and evaluate on collection.')
+    parser.add_argument('--regression', default='', type=str, metavar='collection', help='Run end-to-end regression.')
 
     args = parser.parse_args()
 
     solr = SolrClient()
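The diff breaks off here, at the construction of the client; the flag dispatch that follows in the scripts is not shown. For orientation, a sketch of what such a dispatch could look like using the methods defined above (illustrative only, not the file's verbatim tail):

```python
# Hypothetical sketch of the flag dispatch that follows the truncated
# diff; attribute names follow argparse's flag-to-attribute conversion,
# and the method names are those defined in SolrClient above.
if args.ping:
    logger.info('Solr is alive: {}'.format(solr.is_alive()))
elif args.check_index_exists:
    logger.info(solr.does_index_exist(args.check_index_exists))
elif args.create_index:
    solr.create_index(args.create_index)
elif args.delete_index:
    solr.delete_index(args.delete_index)
elif args.insert_docs:
    solr.insert_docs(args.insert_docs, args.input)
elif args.evaluate:
    solr.evaluate(args.evaluate)
elif args.regression:
    # End-to-end: recreate the index, insert documents, then evaluate.
    if solr.does_index_exist(args.regression):
        solr.delete_index(args.regression)
    solr.create_index(args.regression)
    solr.insert_docs(args.regression, args.input)
    solr.evaluate(args.regression)
```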