From 93d913f2619c44451be3d46816c7b9c44cbeb091 Mon Sep 17 00:00:00 2001
From: Yifan Yang <y454yang@uwaterloo.ca>
Date: Wed, 9 Sep 2020 20:19:37 +0800
Subject: [PATCH] Remove paragraph-indexing module (#1375)

---
 src/main/python/paragraph_indexing/README.md  |  86 ------
 .../python/paragraph_indexing/__init__.py     |   0
 src/main/python/paragraph_indexing/paraseg.py | 258 ------------------
 .../python/paragraph_indexing/seg_core17.py   |  78 ------
 .../python/paragraph_indexing/seg_robust04.py |  90 ------
 src/main/python/paragraph_indexing/utils.py   |  80 ------
 6 files changed, 592 deletions(-)
 delete mode 100644 src/main/python/paragraph_indexing/README.md
 delete mode 100644 src/main/python/paragraph_indexing/__init__.py
 delete mode 100644 src/main/python/paragraph_indexing/paraseg.py
 delete mode 100644 src/main/python/paragraph_indexing/seg_core17.py
 delete mode 100644 src/main/python/paragraph_indexing/seg_robust04.py
 delete mode 100644 src/main/python/paragraph_indexing/utils.py

diff --git a/src/main/python/paragraph_indexing/README.md b/src/main/python/paragraph_indexing/README.md
deleted file mode 100644
index 097ecd093b..0000000000
--- a/src/main/python/paragraph_indexing/README.md
+++ /dev/null
@@ -1,86 +0,0 @@
-# Paragraph Indexing
-
-
-
-## Segment
-
-Segment each raw document into paragraph and dump out into seperate .json file named with DOCID in json format, e.g.
-
-```
-[
-    {
-        'id':'{$DOCNO}.0001',
-        'content':'content0001'
-	},
-    {
-        'id':'{$DOCNO}.0002',
-        'content':'content0001'
-    }
-]
-```
-
-This is done by calling `seg_${collection}.py`, where supported collections so far are `robust04` and `core17`
-
-
-
-### Example:
-
-Run 
-
-```
-python seg_robust04.py \
- --input lucene-index.robust04.pos+docvectors+rawdocs.allDocids.txt.output.tar.gz \
- --output robust04.paragraphs/
-```
-
-All documents will be segmented into paragraph and stored in folder `./robust04.paragraphs/`
-
-
-
-### Input file
-
-The input raw documents should be a `tar.gz` file containing each document in a seperate file named as DOCID. This file can be generated through following command (e.g Robust04)
-
-Suppose you're under Anserini directory. First indexing
-
-```bash
-nohup sh target/appassembler/bin/IndexCollection -collection TrecCollection \
- -input /path/to/disk45/ -generator JsoupGenerator \
- -index lucene-index.robust04.pos+docvectors+rawdocs -threads 16 \
- -storePositions -storeDocvectors -storeRawDocs -optimize \
- >& log.robust04.pos+docvectors+rawdocs &
-```
-
-and then dump the raw documents by the following two steps:
-1. dump all docids of the collection
-2. feed the docids file to dump raw documents
-
-```bash
-sh target/appassembler/bin/IndexUtils \
- -index lucene-index.robust04.pos+docvectors+rawdocs \
- -dumpAllDocids NONE &&
-sh target/appassembler/bin/IndexUtils \
- -index lucene-index.robust04.pos+docvectors+rawdocs \
- -dumpRawDocs lucene-index.robust04.pos+docvectors+rawdocs.allDocids.txt
-```
-
-and the output `tar.gz` file will be named as 
-
-```
-lucene-index.robust04.pos+docvectors+rawdocs.allDocids.txt.output.tar.gz
-```
-
-
-
-## Paragraph Indexing
-
-The json file can be indexed using `JsonCollection` in Anserini. Run
-
-```bash
-sh target/appassembler/bin/IndexCollection -collection JsonCollection \
- -input /path/to/robust04.paragraphs -generator LuceneDocumentGenerator \
- -index lucene-index.robust04.paragraphs.pos+docvectors+rawdocs -threads 16 \
- -storePositions -storeDocvectors -storeRawDocs -optimize  &&
-```
-
-to index each paragraph for Robust04 collection. `-input` should be the output folder of the paragraph segmentation
diff --git a/src/main/python/paragraph_indexing/__init__.py b/src/main/python/paragraph_indexing/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/src/main/python/paragraph_indexing/paraseg.py b/src/main/python/paragraph_indexing/paraseg.py
deleted file mode 100644
index 84f8c0d049..0000000000
--- a/src/main/python/paragraph_indexing/paraseg.py
+++ /dev/null
@@ -1,258 +0,0 @@
-"""
-Anserini: A toolkit for reproducible information retrieval research built on Lucene
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
-class _ParaSegmenter(object):
-    """ The base class for all Paragraph Segmentation class """
-    def __init__(self, bufferedreader, start_pattern_list=None):
-        self._br = bufferedreader
-        self._isstart = False
-        self._curline = None
-        self._paralist = []
-        self._setup(start_pattern_list)
-
-    def _setup(self, pattern_list):
-        """ Find the start of the first paragraph of input document
-        This can only be called once when initializing the object
-
-        Args:
-            pattern_list(list): an indicator of starting a paragraph
-
-        Effect:
-            self._isstart set to True if found, else remains False
-            If <code>pattern_list</code> is None, treated as started
-        """
-        if pattern_list is None:
-            self._isstart = True
-            return
-
-        while True:
-            self._curline = self._br.readline()
-            if not self._curline:
-                return
-
-            if self._curline in pattern_list:
-                self._isstart = True
-                return
-
-    def _isend(self, line):
-        """ An indicator of the a paragraph's end. The code in this base class indicates
-        the end of a documents, as the end of a document indicates the end of a paragraph as well.
-
-        Args:
-            line(str): the line to be tested on
-
-        Return:
-            (bool): if this line is a paragraph end
-        """
-        if not line or line == b'</TEXT>':
-            return True
-
-        return False
-
-    def hasnextpara(self):
-        """ Check if there is a paragraph in this document
-
-        If self._isstart == False after initialization, no useful information is contained in
-        this document, then return False
-        If reach the end of the doc, return False
-
-        Return:
-            (bool) if this document has further content
-
-        """
-        if not self._isstart or not self._curline or self._curline == b'</TEXT>':
-            return False
-        return True
-
-    def nextpara(self):
-        """ Two cases here:
-
-        1. There is a pattern indicating a new paragraph followed by self._curline,
-            In this case, after calling self.hasnextpara(), len(self._paralist) == 0.
-            In this case, also, subclasses should fill self._paralist in `self.hasnextpara()`
-        2. There is only a pattern indicating the end of a paragraph,
-            so one has to readline until the end to see if it is a paragraph.
-            In this case, after calling self.hasnextpara() and return True,
-            len(self._paralist) > 0
-
-        Return:
-            str: A string contains a paragraph
-        """
-        if not self._paralist:
-            while True:
-                self._curline = self._br.readline()
-                if self._isend(self._curline):
-                    break
-                self._paralist.append(self._curline.decode('utf-8').strip())
-
-        parastr = ' '.join(self._paralist)
-        del self._paralist[:]
-        return parastr
-
-
-class FBISParaSegmenter(_ParaSegmenter):
-    """ A Segmenter to segment documents in FBIS collection under Robust04.
-
-    Args:
-        bufferedreader (io.BufferedReader): the buffered reader of a document.
-    """
-    def __init__(self, bufferedreader):
-        start_pattern_list = [b'\n'] # start pattern by observation
-
-        super(FBISParaSegmenter, self).__init__(bufferedreader, start_pattern_list)
-        self._linelimit = 50 # An empirical number to decide if this is end of paragraph
-
-    def _isend(self, line):
-        if super(FBISParaSegmenter, self)._isend(line):
-            return True
-
-        if line[-2:] == b'.\n' and len(line) < self._linelimit:
-            return True
-
-        return False
-
-    def hasnextpara(self):
-        if not super(FBISParaSegmenter, self).hasnextpara():
-            return False
-
-        while True:
-            self._curline = self._br.readline()
-            if self._isend(self._curline):
-                break
-            self._paralist.append(self._curline.decode('utf-8').strip())
-
-        if not self._paralist:
-            # Handle the following pattern
-            # b'\n'
-            # b'</Text>\n'
-            return False
-
-        if len(self._curline) > 1:
-            # skip if self._curline == b'\n'
-            self._paralist.append(self._curline.decode('utf-8').strip())
-
-        return True
-
-
-class FR94ParaSegmenter(_ParaSegmenter):
-    """ A Segmenter to segment documents in FR94 collection under Robust04.
-
-    Args:
-        bufferedreader (io.BufferedReader): the buffered reader of a document.
-    """
-    def __init__(self, bufferedreader):
-        self._start_pattern_list = [
-            b'<!-- PJG 0012 frnewline -->\n',
-            b'<!-- PJG ITAG l=11 g=1 f=1 -->\n'
-        ] # start pattern by observation
-        super(FR94ParaSegmenter, self).__init__(bufferedreader, self._start_pattern_list)
-
-    def _isend(self, line):
-        if super(FR94ParaSegmenter, self)._isend(line):
-            return True
-        if line[:4] == b'<!--':
-            return True
-        return False
-
-    def hasnextpara(self):
-        if not self._isstart:
-            return False
-
-        # find start pattern
-        while self._curline:
-            if self._curline in self._start_pattern_list:
-                return True
-            self._curline = self._br.readline()
-
-        return False
-
-
-class FTParaSegmenter(_ParaSegmenter):
-    """ A Segmenter to segment documents in FT collection under Robust04.
-
-    Args:
-        bufferedreader (io.BufferedReader): the buffered reader of a document.
-    """
-    def __init__(self, bufferedreader):
-        start_pattern_list = [b'<TEXT>\n'] # start pattern by observation
-        super(FTParaSegmenter, self).__init__(bufferedreader, start_pattern_list)
-        self._linelimit = 105 # An empirical number to decide if this is end of paragraph
-
-    def _isend(self, line):
-        if super(FTParaSegmenter, self)._isend(line):
-            return True
-        if line[-2:] == b'.\n' and len(line) < self._linelimit:
-            return True
-        return False
-
-    def hasnextpara(self):
-        if not super(FTParaSegmenter, self).hasnextpara():
-            return False
-
-        while True:
-            self._curline = self._br.readline()
-            self._paralist.append(self._curline.decode('utf-8').strip())
-            if self._isend(self._curline):
-                break
-
-        return True
-
-
-class LAParaSegmenter(_ParaSegmenter):
-    """ A Segmenter to segment documents in LA collection under Robust04.
-
-    Args:
-        bufferedreader (io.BufferedReader): the buffered reader of a document.
-    """
-    def __init__(self, bufferedreader):
-        self._start_pattern_list = [b'<P>\n']
-        super(LAParaSegmenter, self).__init__(bufferedreader, self._start_pattern_list)
-
-    def _isend(self, line):
-        if super(LAParaSegmenter, self)._isend(line):
-            return True
-        if line == b'</P>\n':
-            return True
-        return False
-
-    def hasnextpara(self):
-        if not super(LAParaSegmenter, self).hasnextpara():
-            return False
-
-        while self._curline:
-            if self._curline in self._start_pattern_list:
-                return True
-            self._curline = self._br.readline()
-
-        return False
-
-
-
-class NYTParaSegmenter(_ParaSegmenter):
-    """ A Segmenter to segment documents in New York Times collection under Core17.
-
-    Args:
-        bufferedreader (io.BufferedReader): the buffered reader of a document.
-    """
-    def __init__(self, bufferedreader):
-        super(NYTParaSegmenter, self).__init__(bufferedreader)
-
-    def hasnextpara(self):
-        self._curline = self._br.readline()
-        return len(self._curline) > 0
-
-    def nextpara(self):
-        return self._curline.decode('utf-8').strip()
diff --git a/src/main/python/paragraph_indexing/seg_core17.py b/src/main/python/paragraph_indexing/seg_core17.py
deleted file mode 100644
index adbd02984a..0000000000
--- a/src/main/python/paragraph_indexing/seg_core17.py
+++ /dev/null
@@ -1,78 +0,0 @@
-"""
-Anserini: A toolkit for reproducible information retrieval research built on Lucene
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
-import argparse
-import json
-import logging
-import os
-
-from .paraseg import NYTParaSegmenter
-from .utils import TgzReader, safe_mkdir, form_json
-
-if __name__ == '__main__':
-    logging.basicConfig(level=logging.DEBUG,
-                        format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S ')
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input", '-i', type=str,
-                        help='path to input tgz file', required=True)
-    parser.add_argument("--output", '-o', type=str,
-                        help='path to output folder', required=True)
-
-    args = parser.parse_args()
-    input_path = args.input
-    output_path = args.output
-
-    # preprocessing
-    safe_mkdir(output_path)
-
-    # start to segment
-    counter = 0
-
-    logging.info('start to segment files from %s', input_path)
-    reader = TgzReader(input_path)
-    while reader.hasnext():
-        counter += 1
-        if counter % 100000 == 0:
-            logging.info('%s files have been processed', counter)
-
-        docname, content_buffer = reader.next()
-        segmenter = NYTParaSegmenter(content_buffer)
-
-        paraid = 0
-        parajsonarray = []
-        while segmenter.hasnextpara():
-            parastr = segmenter.nextpara()
-
-            paraid += 1
-            if paraid >= 10000:
-                logging.info('document %s has more than 10000 paragraphs...', docname)
-                break
-            parajsonarray.append(form_json(docname, paraid, parastr))
-
-        # This is an empty file
-        if paraid == 0:
-            paraid += 1
-            parastr = ''
-            parajsonarray.append(form_json(docname, paraid, parastr))
-
-        jsonstr = json.dumps(parajsonarray, separators=(',', ':'), indent=2)
-
-        with open(os.path.join(output_path, '{}.json'.format(docname)), 'w') as f:
-            f.write(jsonstr)
-
-    logging.info('%d files have been segmented into paragraphs stored in %s', counter, output_path)
-    reader.close()
diff --git a/src/main/python/paragraph_indexing/seg_robust04.py b/src/main/python/paragraph_indexing/seg_robust04.py
deleted file mode 100644
index 24a12771d6..0000000000
--- a/src/main/python/paragraph_indexing/seg_robust04.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""
-Anserini: A toolkit for reproducible information retrieval research built on Lucene
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
-import argparse
-import json
-import logging
-import os
-
-from .paraseg import FBISParaSegmenter, FR94ParaSegmenter, FTParaSegmenter, LAParaSegmenter
-from .utils import TgzReader, safe_mkdir, form_json
-
-SEGMENTER = {
-    'FB': FBISParaSegmenter,
-    'FR': FR94ParaSegmenter,
-    'FT': FTParaSegmenter,
-    'LA': LAParaSegmenter
-}
-
-if __name__ == '__main__':
-    logging.basicConfig(level=logging.DEBUG,
-                        format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S ')
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input", '-i', type=str,
-                        help='path to input tgz file', required=True)
-    parser.add_argument("--output", '-o', type=str,
-                        help='path to output folder', required=True)
-
-    args = parser.parse_args()
-    input_path = args.input
-    output_path = args.output
-
-    # preprocessing
-    safe_mkdir(output_path)
-
-    # start to segment
-    counter = 0
-
-    logging.info('start to segment files from %s', input_path)
-    reader = TgzReader(input_path)
-    while reader.hasnext():
-        counter += 1
-        if counter % 100000 == 0:
-            logging.info('%d files have been processed', counter)
-
-        docname, content_buffer = reader.next()
-        if docname[:2] in SEGMENTER:
-            segmenter = SEGMENTER[docname[:2]](content_buffer)
-        else:
-            raise TypeError('Invalid file type')
-
-        paraid = 0
-        parajsonarray = []
-        while segmenter.hasnextpara():
-            parastr = segmenter.nextpara()
-            if len(parastr) < 50:
-                continue
-
-            paraid += 1
-            if paraid >= 10000:
-                logging.info('document %s has more than 10000 paragraphs...', docname)
-                break
-            parajsonarray.append(form_json(docname, paraid, parastr))
-
-        # This is an empty file
-        if paraid == 0:
-            paraid += 1
-            parastr = ''
-            parajsonarray.append(form_json(docname, paraid, parastr))
-
-        jsonstr = json.dumps(parajsonarray, separators=(',', ':'), indent=2)
-
-        with open(os.path.join(output_path, '{}.json'.format(docname)), 'w') as f:
-            f.write(jsonstr)
-
-    logging.info('%d files have been segmented into paragraphs stored in %s', counter, output_path)
-    reader.close()
diff --git a/src/main/python/paragraph_indexing/utils.py b/src/main/python/paragraph_indexing/utils.py
deleted file mode 100644
index 20af4973bf..0000000000
--- a/src/main/python/paragraph_indexing/utils.py
+++ /dev/null
@@ -1,80 +0,0 @@
-"""
-Anserini: A toolkit for reproducible information retrieval research built on Lucene
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
-import os
-import tarfile
-
-class TgzReader(object):
-    """ A Reader to read tar.gz with multiple raw document files efficiently.
-
-    A tar.gz file has the following structure:
-
-        DOCID_1:
-            RAW DOCS 1
-        DOCID_2:
-            RAW DOCS 2
-        ...
-
-    Args:
-        path(str): the path of input tar.gz file
-    """
-    def __init__(self, path):
-        self._path = path
-        self._tar = tarfile.open(path, "r:gz")
-        self._next = None
-
-    def close(self):
-        """ close the reader stream. """
-        self._tar.close()
-
-    def hasnext(self):
-        """ return whether the tar file has files or not
-
-        Returns:
-            bool: whether the tar file has files or not
-        """
-        self._next = self._tar.next()
-        if not self._next:
-            return False
-        if not self._next.isfile():
-            return self.hasnext()
-        return True
-
-    def next(self):
-        """ get the next tf-idf files
-
-        Returns:
-            (str, io.BytesIO): the file name and a bytes io stream containing doc contents.
-
-        Raises:
-            ValueError: if there is not next entry in this tar file.
-        """
-        if self._next:
-            return self._next.name, self._tar.extractfile(self._next)
-        raise ValueError("No files.")
-
-def safe_mkdir(path):
-    """ create directory if not exists """
-    if not os.path.exists(path):
-        os.mkdir(path)
-
-def form_json(doc_name, para_id, content):
-    """ form a document into json format """
-    doc = {
-        'id': '{}.{:04d}'.format(doc_name, para_id),
-        'contents': content
-    }
-    return doc