wordbreaker.py

#!/usr/bin/env python3

# Word segmentation
# John Goldsmith 2014-

# code refactoring/optimization in progress, Jackson Lee, 7/6/2015

# audrey  08/2015
# TODO: In ReadBrokenCorpus(), separate additional punctuation marks from preceding word. e.g. comma


import os
import math
import argparse
import sys
import json
import jsonpickle
from pathlib import Path
import copy

import time
import datetime

from latexTable_py3 import MakeLatexTable
from lxa5lib import (get_language_corpus_datafolder, json_pdump,
                     changeFilenameSuffix, stdout_list,
                     load_config_for_command_line_help)


def makeArgParser(configfilename="config.json"):

    language, \
    corpus, \
    datafolder, \
    configtext = load_config_for_command_line_help(configfilename)

    parser = argparse.ArgumentParser(
        description="This program segments unbroken text into words.\n\n{}"
                    .format(configtext),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--config", help="configuration filename",
                        type=str, default=configfilename)

    parser.add_argument("--ibase", help="iterative processing for this run will start at iteration # (ibase+1); use '0' for fresh start.",
                        type=int, default=None)
    parser.add_argument("--itarget", help="last iteration to perform in this run;\n thus # cycles = itarget - ibase",
                        type=int, default=None)
    parser.add_argument("--statefile", help="jsonpickle file needed to restore previous state in order to resume processing; not needed if ibase==0",
                type=str, default=None)
    parser.add_argument("--candidates", help="number of candidates per iteration",
                        type=int, default=25)
    parser.add_argument("--corpuslinestoread", help="number of lines to read in from corpus (defaults to 'sys.maxsize' to read entire corpus)",
                        type=int, default=sys.maxsize)
    parser.add_argument("--verbose", help="verbose output",
                        type=bool, default=False)

    parser.add_argument("--language", help="Language name",
                        type=str, default=None)
    parser.add_argument("--corpus", help="Corpus file to use",
                        type=str, default=None)
    parser.add_argument("--datafolder", help="path of the data folder",
                        type=str, default=None)

    return parser


class LexiconEntry:
    def __init__(self, key = "", count = 0):
        self.m_Key = key
        self.m_Count = count
        self.m_Frequency= 0.0
        self.m_ParentWords = list()
        self.m_ChildWords = list()
        self.m_Suffix = False
        self.m_Stem = False
        self.m_CountRegister = list()


    def UpdateRegister(self, current_iteration):
        if len(self.m_CountRegister) > 0:
            last_count = self.m_CountRegister[-1][1]
            if self.m_Count != last_count or self.m_ChildWords != []:
                self.m_CountRegister.append((current_iteration, self.m_Count, self.m_ChildWords))
        else:
            self.m_CountRegister.append((current_iteration, self.m_Count, self.m_ChildWords))
        #self.m_Count = 0


    def Display(self, outfile):
        print(self.m_Key, "   ", "{:.7f}".format(self.m_Frequency), "   ", "{:.7f}".format(-1* math.log(self.m_Frequency, 2)), file = outfile)
        if len(self.m_ParentWords) > 0:
            expression = "/".join( self.m_ParentWords )
            #print("expression is ", expression)
            print("%s" % ("{:<}".format(expression)), file=outfile)

        for (iteration_number, count, childwords) in self.m_CountRegister:
                if childwords == []:
                    #print("{0:6}".format(iteration_number), " ",  "{0:10}".format(count), file=outfile)
                    print("%6i %10s" % (iteration_number, "{:,}".format(count)), file=outfile)
                else:
                    #print("{0:6}".format(iteration_number), " ",  "{0:10}".format(count), ":<".format(childwords), file=outfile)
                    print("%6i %10s    %-100s" % (iteration_number, "{:,}".format(count),  childwords), file=outfile)

# ---------------------------------------------------------#
class Lexicon:
    def __init__(self):
        self.m_TrueDictionary = dict()
        self.m_LetterPlog = dict()
        self.m_EntryDict = dict()
        self.m_DictionaryCost = 0   #in bits!
        #self.m_BigramDict = dict()               # November 20, 2015  # not yet enabled
        #self.m_TrigramDict = dict()              # November 20, 2015
        self.m_Corpus     = list()
        self.m_SizeOfLongestEntry = 0
        self.m_CorpusCost = 0.0
        self.m_ParsedCorpus = list()
        self.m_NumberOfHypothesizedRunningWords = 0
        self.m_NumberOfTrueRunningWords = 0
        self.m_BreakPointList = list()
        self.m_DeletionList = list()  # these are the words that were nominated and then not used in any line-parses *at all*.
        self.m_DeletionDict = dict()  # They never stop getting nominated.
        self.m_Break_based_RecallPrecisionHistory = list()
        self.m_Token_based_RecallPrecisionHistory = list()
        self.m_Type_based_RecallPrecisionHistory = list()
        self.m_DictionaryCostHistory = list()
        self.m_CorpusCostHistory = list()

    # ---------------------------------------------------------#  
    #def AddEntry(self,key,count):
    #    this_entry = LexiconEntry(key,count)
    #    self.m_EntryDict[key] = this_entry
    #    if len(key) > self.m_SizeOfLongestEntry:
    #        self.m_SizeOfLongestEntry = len(key)
    # ---------------------------------------------------------#
    def AddEntry(self,key,new_entry):    #was AddEntry(self,key,count)
        #this_entry = LexiconEntry(key,count)
        self.m_EntryDict[key] = copy.deepcopy(new_entry)
        if len(key) > self.m_SizeOfLongestEntry:
            self.m_SizeOfLongestEntry = len(key)
        return self.m_EntryDict[key]
    # ---------------------------------------------------------#

    # Found bug here July 5 2015: important, don't let it remove a singleton letter! John
    # don't del key-value pairs within still iterating through the pairs among the dict; fix by John Goldsmith July 6 2015
    def FilterZeroCountEntries(self, outfile):
        TempDeletionList = list()
        for key, entry in self.m_EntryDict.items():
            if len(key) == 1 and entry.m_Count==0:
                entry.m_Count = 1
                continue
            if len(key)>1 and entry.m_Count == 0:   # DO WE REALLY WANT TO DISALLOW ITS USE FOR THE FUTURE?
                # Transfer this entry from the EntryDict to the DeletionDict (Keep a list but don't delete yet!)
                TempDeletionList.append(key)
                self.m_DeletionDict[key] = entry

        for key in TempDeletionList:
            del self.m_EntryDict[key]
            print("---> Deleted ", key)
            print("---> Deleted ", key, file=outfile)

    # ---------------------------------------------------------#
    def UpdateLexEntryRegisters(self, iteration_number):
        for key, entry in self.m_EntryDict.items():
            entry.UpdateRegister(iteration_number)
    # ---------------------------------------------------------#
#
#    def CountBigrams(self):         # not yet enabled
#    	self.m_BigramDict = dict()
#		for parsed_line in self.m_ParsedCorpus:
#			for wordno in range(len(parsed_line)-2):  # (FOR NOW)  SHOULD YOU PREDICT THE END PUNC OR END-OF-LINE?  TRY IT WITH AND WITHOUT  Nov. 21, 2015
#				word0 = parsed_line[wordno]
#                word1 = parsed_line[wordno + 1]
            
    # ---------------------------------------------------------#


    #-------------------------------------------------------#
    #  Populate data structures for corpus and dictionary.  #
    #  Record truth for measuring performance.              #
    #  Compute initial values for metrics.                  #
    #-------------------------------------------------------#
    def ReadBrokenCorpus(self, infilepathname, outfile, howmuchcorpus):
        print("Name of data file: ", infilepathname)
        if not os.path.isfile(infilepathname):  # Note: already checked at startup by get_language_corpus_datafolder()
            print("Warning:", infilepathname, "does not exist.")
            print("Check file paths and names.")
            sys.exit()
        infile = open(infilepathname)

        rawcorpus_list = infile.readlines() # bad code if the corpus is very large -- but then we won't use python.
        if howmuchcorpus != "all":
            rawcorpus_list = rawcorpus_list[0:int(howmuchcorpus)]    # 0 <= index < corpuslinestoread
        print("Number of lines read =", len(rawcorpus_list))

        for line in rawcorpus_list:
            this_line = ""
            breakpoint_list_forline = list()

            # Clean up data as desired   # This is for the sake of the TrueDictionary 
            line = line.casefold()
            line = line.replace(".", " . ")
            line = line.replace(",", " , ")
            line = line.replace(";", " ; ")
            line = line.replace("!", " ! ")
            line = line.replace("?", " ? ")
            line = line.replace(":", " : ")
            line = line.replace(")", " ) ")
            line = line.replace("(", " ( ")

            line_list = line.split()                           # split line into words
            if len(line_list) <=  1:
                continue
            for word in line_list:                             # by word in line
                self.m_NumberOfTrueRunningWords += 1               # build up TrueDictionary
                if word not in self.m_TrueDictionary:              # build up unbroken line
                    self.m_TrueDictionary[word] = 1                # build up breakpoint list
                else:
                    self.m_TrueDictionary[word] += 1
                this_line += word
                breakpoint_list_forline.append(len(this_line))


            #for letter in line:
            for letter in this_line:                           # by character in unbroken line
                if letter not in self.m_EntryDict:                 # build up dictionary
                    this_entry = LexiconEntry()
                    this_entry.m_Key = letter
                    this_entry.m_Count = 1   #counts occurrences of letter in the corpus
                    self.AddEntry(letter, this_entry)
                else:
                    self.m_EntryDict[letter].m_Count += 1

            self.m_Corpus.append(this_line)                    # by unbroken line
            self.m_BreakPointList.append(breakpoint_list_forline)  # build up unbroken corpus

        print("# " + str(len(self.m_TrueDictionary)) + " distinct words in the original corpus.", file=outfile)   # Goes into outfile header

		#----------------------------#
		
        # AT THIS POINT WE HAVE
            # the data:        m_Corpus         unbroken text
            # the dictionary:  m_EntryDict      initial version, entries for single characters

        # HERE IS THE INITIAL PARSE
        for line in self.m_Corpus:
            self.m_ParsedCorpus.append(list(line))    #Parsing is easy!

        self.m_NumberOfHypothesizedRunningWords = 0
        for (key, entry) in self.m_EntryDict.items():
            self.m_NumberOfHypothesizedRunningWords += entry.m_Count

        # PLOGS
        self.m_CorpusCost = 0.0
        self.m_DictionaryCost = 0.0
        self.m_SizeOfLongestEntry = 1
        self.ComputeDictFrequencies()

        for key, entry in self.m_EntryDict.items():
            this_plog = -1 * math.log(entry.m_Frequency, 2)      # applies to both (a)letter uses in data, AND (b)word uses in current parse
            self.m_LetterPlog[key] = this_plog                   # record it
            self.m_DictionaryCost += this_plog                   # as in (a)
            self.m_CorpusCost += entry.m_Count * this_plog       # as in (b)

            entry.m_CountRegister.append((0, entry.m_Count, []))  # iteration_number, count, childwords  as in UpdateRegister()


        print("\n Startup")
        self.Report(0, outfile)

        infile.close()


# ---------------------------------------------------------#
    def ComputeDictFrequencies(self):
        TotalCount = 0
        for (key, entry) in self.m_EntryDict.items():
            TotalCount += entry.m_Count
        for (key, entry) in self.m_EntryDict.items():
            entry.m_Frequency = entry.m_Count/float(TotalCount)
# ---------------------------------------------------------#
#    # added july 2015 john
#    def ComputeDictionaryLength(self):
#        DictionaryCost = 0
#        for word in self.m_EntryDict:
#            wordlength = 0
#            letters = list(word)
#            for letter in letters:
#                wordlength += self.m_LetterPlog[letter]
#            DictionaryLength += wordlength
#        self.m_DictionaryCost = DictionaryCost
#        self.m_DictionaryCostHistory.append(DictionaryCost)
#
# ---------------------------------------------------------#

    def ParseCorpus(self, current_iteration, outfile):
        self.m_ParsedCorpus = list()
        self.m_CorpusCost = 0.0
        self.m_NumberOfHypothesizedRunningWords = 0

        for word, entry in self.m_EntryDict.items():
            entry.m_Count = 0
        for line in self.m_Corpus:
            parsed_line,bit_cost =     self.ParseWord(line, outfile)
            self.m_ParsedCorpus.append(parsed_line)
            self.m_CorpusCost += bit_cost
            for word in parsed_line:
                self.m_EntryDict[word].m_Count +=1
                self.m_NumberOfHypothesizedRunningWords += 1
# ---------------------------------------------------------#
    def PrintParsedCorpus(self,outfile):
        for line in self.m_ParsedCorpus:
            PrintList(line, outfile)
# ---------------------------------------------------------#
    def ParseWord(self, word, outfile):
        wordlength = len(word)

        Parse=dict()
        Piece = ""
        LastChunk = ""
        BestCompressedLength = dict()
        BestCompressedLength[0] = 0
        CompressedSizeFromInnerScanToOuterScan = 0.0
        LastChunkStartingPoint = 0
        # <------------------ outerscan -----------><------------------> #
        #                  ^---starting point
        # <----prefix?----><----innerscan---------->
        #                  <----Piece-------------->
        if verboseflag: print("\nOuter\tInner", file=outfile)
        if verboseflag: print("scan:\tscan:\tPiece\tFound?", file=outfile)
        for outerscan in range(1,wordlength+1):
            Parse[outerscan] = list()
            MinimumCompressedSize= 0.0
            startingpoint = 0
            if outerscan > self.m_SizeOfLongestEntry:
                startingpoint = outerscan - self.m_SizeOfLongestEntry
            for innerscan in range(startingpoint, outerscan):
                if verboseflag: print("\n %3s\t%3s  " % (outerscan, innerscan), end=" ", file=outfile)
                Piece = word[innerscan: outerscan]
                if verboseflag: print(" %5s"% Piece, end=" ", file=outfile)
                if Piece in self.m_EntryDict:
                    if verboseflag: print("   %5s" % "Yes.", end=" ", file=outfile)
                    CompressedSizeFromInnerScanToOuterScan = -1 * math.log( self.m_EntryDict[Piece].m_Frequency, 2)
                    newvalue =  BestCompressedLength[innerscan]  + CompressedSizeFromInnerScanToOuterScan
                    if verboseflag: print(" %7.3f bits" % (newvalue), end=" ", file=outfile)
                    if  MinimumCompressedSize == 0.0 or MinimumCompressedSize > newvalue:
                        MinimumCompressedSize = newvalue
                        LastChunk = Piece
                        LastChunkStartingPoint = innerscan
                        if verboseflag: print(" %7.3f bits" % (MinimumCompressedSize), end=" ", file=outfile)
                else:
                    if verboseflag: print("   %5s" % "No. ", end=" ", file=outfile)
            BestCompressedLength[outerscan] = MinimumCompressedSize
            if LastChunkStartingPoint > 0:
                Parse[outerscan] = list(Parse[LastChunkStartingPoint])
            else:
                Parse[outerscan] = list()
            if verboseflag: print("\n\t\t\t\t\t\t\t\tchosen:", LastChunk, end=" ", file=outfile)
            Parse[outerscan].append(LastChunk)     # NOTICE  NOTICE  NOTICE  NOTICE  
        if verboseflag:
            PrintList(Parse[wordlength], outfile)
        bitcost = BestCompressedLength[outerscan]
        return (Parse[wordlength],bitcost)
# ---------------------------------------------------------#
    def GenerateCandidates_standardmethod(self, current_iteration, howmany, outfile):
            # m_ChildWords (for display from the CountRegister at this iteration only) are set in this function
            for word, lexicon_entry in self.m_EntryDict.items():
                    lexicon_entry.m_ChildWords = []

            CandidateDict = dict()      # key is the new word, value is a LexiconEntry object
            NomineeList = list()
            for parsed_line in self.m_ParsedCorpus:
                for wordno in range(len(parsed_line)-1):   # SO WHY DO WE HAVE 'said.'  ?  audrey  2015_11_21
                #for wordno in range(len(parsed_line)-2):  # under consideration 
                    word0 = parsed_line[wordno]
                    word1 = parsed_line[wordno + 1]
                    candidate = word0 + word1
                    if candidate in self.m_EntryDict:
                        continue
                    if candidate in CandidateDict:
                        CandidateDict[candidate].m_Count += 1
                    else:
                        this_entry = LexiconEntry(candidate)
                        this_entry.m_Count = 1
                        this_entry.m_ParentWords = [word0, word1]
                        CandidateDict[candidate] = this_entry

            SortableCandidateDict = dict()
            for candidate, lex_entry in CandidateDict.items():
                SortableCandidateDict[candidate] = lex_entry.m_Count
            #s = sorted(SortableCandidateDict.items(),key=lambda x:x[0], reverse=True)        # secondary sort: alphabetic 
            #EntireCandidateList = sorted(s,key=lambda x:x[1], reverse=True)                  # primary sort: by count
            EntireCandidateList = sorted(SortableCandidateDict.items(), key=lambda x:(x[1],x[0]), reverse=True)   #primary sort key is count, secondary is alphabetic

            for candidate, count in EntireCandidateList:
                #print("candidate = ", candidate)
                if candidate in self.m_DeletionDict:
                    continue
                NomineeList.append((candidate, CandidateDict[candidate]))
                if len(NomineeList) == howmany:
                	break
                

            print("Nominees:")
            latex_data= list()
            latex_data.append("piece   count   parent0    parent1   status")
            for nominee, entry in NomineeList:
                # for the new word
                entry.m_CountRegister.append((current_iteration, entry.m_Count, []))  # as in UpdateRegister()
                self.AddEntry(nominee,entry)                                           # This records the initial count--before parsing.
                                                                                          # If the actual count used in the parse differs,
                                                                                          # both records will appear for this iteration in the CountRegister.
                # for the trace
                for word in entry.m_ParentWords:
                    self.m_EntryDict[word].m_ChildWords.append(nominee)  #will go into m_CountRegister to display along with changed counts

                # for display
                expr = ""
                for x in entry.m_ParentWords:
                    expr = expr + "%12s" % (x)
                print("%15s  %10s %-50s" % (nominee, '{:,}'.format(entry.m_Count), expr))
                latex_data.append(nominee +  "\t" + '{:,} {}'.format(entry.m_Count, expr) )

            MakeLatexTable(latex_data,outfile)

            return NomineeList      # NOTE THAT THE RETURN VALUE IS NOT USED 

    # ---------------------------------------------------------#
#   def GenerateCandidates_suffixmethod(self):      # Similar to GenerateCandidates() [renamed as GenerateCandidates_standardmethod]
#
#       # Take in the suffixes from Linguistica
#       suffix_infile_name  = "Corpus" + str(current_iteration - 1) + "_1_Mini1_Suffixes.txt"
#       suffix_outfile_name = "Corpus" + str(current_iteration - 1) + ".Suffixes_extracted.txt"
#       suffixList = this_lexicon.ExtractListFromLing413File(suffix_infile_name, suffix_outfile_name, 0)
#       suffixList.sort()
#       #this_lexicon.AddSuffixMark(suffixList)   # PUT THIS IN WHEN VERSION HAS m_Suffix
#       print "Suffix List from", suffix_infile_name
#       for suffix in suffixList:
#           print suffix
#
#
#       # Create a dictionary of potential words
#       SuffixedCandidateDict = dict()              # similar to CandidateDict
#       for parsed_line in self.m_ParsedCorpus:
#           for wordno in range(len(parsed_line)-1):
#                   word0 = parsed_line[wordno]
#                   word1 = parsed_line[wordno + 1]
#                   # if self.m_EntryDict[word1].m_Suffix == True:     # PUT THIS IN WHEN VERSION HAS m_Suffix
#                   if suffixList.count(word1) > 0:    # Use this line when running LinkPrevious
#                   #if suffixList.count(word0) > 0:    # Use this line when running LinkFollowing
#                   suffixed_candidate = word0 + word1
#                   if suffixed_candidate in self.m_EntryDict:
#                       continue
#                   if suffixed_candidate in SuffixedCandidateDict:
#                       SuffixedCandidateDict[suffixed_candidate].m_ParseCount += 1
#                   else: # add it
#                       suffixed_entry = LexiconEntry(suffixed_candidate)
#                       suffixed_entry.m_ParseCount = 1
#                       suffixed_entry.m_Subwords = [word0, word1]
#                       SuffixedCandidateDict[suffixed_candidate] = suffixed_entry
#
#
#       # Behind the scenes, try a parse with these additional words
#       # to distinguish false suffixes and exclude unlikely formations
#       temp_lexicon = copy.deepcopy(self)
#       for candidate, entry in SuffixedCandidateDict.iteritems():
#           temp_lexicon.AddEntry(candidate, entry)
#
#       temp_lexicon.ComputeDictFrequencies()
#       temp_lexicon.ParseCorpus (current_iteration, outfile)   # (no actual print to outfile occurs)
#
#
#       # Compare counts from before and after the parse  (increase is good!)
#       DeltaByWord = dict()
#       for key, entry in SuffixedCandidateDict.iteritems():
#           BeforeParseCount = entry.m_ParseCount
#               AfterParseCount = temp_lexicon.m_EntryDict[key].m_ParseCount
#               DeltaByWord[key]  = AfterParseCount - BeforeParseCount
#
#       WordListOrderedByDelta = sorted(DeltaByWord.iteritems(),key=operator.itemgetter(1), reverse=True)
#
#       if True:            # OUTPUT COUNTS TO A FILE - CAN BE STUDIED IN EXCEL
#           i = datetime.datetime.now()
#           timelabel = "%s_%s_%s.%s%s" % (i.year, i.month, i.day, i.hour, i.minute)
#           counts_outfilename = "WordDeltaCounts_" + str(current_iteration)+"." + timelabel + ".txt"
#           counts_outfile = open(counts_outfilename, "w")
#           print >> counts_outfile, "     SuffixedCandidate      Before       After     Subword0    Subword1        Delta"
#           for key, delta in WordListOrderedByDelta:
#               expr = ""
#               for x in SuffixedCandidateDict[key].m_Subwords:
#                       expr = expr + "%12s" % (x)
#               print >> counts_outfile, "%22s  %10s  %10s %-30s  %5s" % \
#               (key, \
#               '{:,}'.format(SuffixedCandidateDict[key].m_ParseCount),        # BeforeParseCount \
#               '{:,}'.format(temp_lexicon.m_EntryDict[key].m_ParseCount),     # AfterParseCount  \
#               expr, '{:,}'.format(delta))
#           counts_outfile.close()
#
#
#       # COLLECT AND OUTPUT DELTA INFO BY SUFFIX
#       MiddleBandLimit = 1   # SET THIS
#       if MiddleBandLimit == 0:
#           NegMidBandLimit = -1
#       else:
#           NegMidBandLimit = -1 * MiddleBandLimit
#
#       # COLLECT
#       # The quantities below are accumulated per suffix based on delta values of the words formed on that suffix.
#       CountPosDeltas  = dict()    # how many words formed on given suffix have a positive delta
#       CountNegDeltas  = dict()
#       CountZeroDeltas = dict()
#
#       SumPosDeltas = dict()       # sum of their delta values
#       SumNegDeltas = dict()
#
#       for suffix in suffixList:
#           CountPosDeltas[suffix]  = 0
#           CountNegDeltas[suffix]  = 0
#           CountZeroDeltas[suffix] = 0
#
#           SumPosDeltas[suffix] = 0
#           SumNegDeltas[suffix] = 0
#
#       for key, delta in WordListOrderedByDelta:
#           suffix = SuffixedCandidateDict[key].m_Subwords[1]
#
#           if delta >= MiddleBandLimit:
#               CountPosDeltas[suffix] += 1
#               SumPosDeltas[suffix] += delta
#           elif delta <= NegMidBandLimit:
#               CountNegDeltas[suffix] += 1
#               SumNegDeltas[suffix] += delta
#           else:
#               CountZeroDeltas[suffix] +=1
#
#       # OUTPUT
#       # LOOKING FOR USEFUL PATTERNS; MOST LIKELY THIS IS NOT THE FINAL FORM
#       suffixstats_outfilename = "SuffixDeltaStats_"  + str(MiddleBandLimit) + "_" + str(current_iteration) + "." + timelabel + ".txt"
#       suffixstats_outfile = open(suffixstats_outfilename, "w")
#       print >> suffixstats_outfile, "  Suffix    #Pos    #Neg   #Zero    #All  SumPos  SumNeg  DeltaTotal    PNRatio     ROI"
#
#       CountAllDeltas = dict()
#       DeltaTotal = dict()
#       PosNegRatio = dict()
#       ROI = dict()
#       for suffix in suffixList:
#           CountAllDeltas[suffix] = CountPosDeltas[suffix] + CountNegDeltas[suffix] + CountZeroDeltas[suffix]
#           DeltaTotal[suffix] = SumPosDeltas[suffix] + SumNegDeltas[suffix]
#
#           if CountAllDeltas[suffix] != 0:
#               ROI[suffix] = float(DeltaTotal[suffix]) / CountAllDeltas[suffix]
#           else:
#               ROI[suffix] = 0
#
#           if SumNegDeltas[suffix] != 0:
#               PosNegRatio[suffix] = float(SumPosDeltas[suffix]) / abs(SumNegDeltas[suffix])
#           else:
#               PosNegRatio[suffix] = SumPosDeltas[suffix]
#           # may also decide to output an average over something or things tbd
#
#           print >> suffixstats_outfile, "%8s  %6s  %6s  %6s  %6s  %6s  %6s  %9s  %9s  %8s"  %  \
#               (suffix, \
#               '{:,}'.format(CountPosDeltas[suffix]),        \
#               '{:,}'.format(CountNegDeltas[suffix]),        \
#               '{:,}'.format(CountZeroDeltas[suffix]),       \
#               '{:,}'.format(CountAllDeltas[suffix]),        \
#               '{:,}'.format(SumPosDeltas[suffix]),          \
#               '{:,}'.format(SumNegDeltas[suffix]),          \
#               '{:,}'.format(DeltaTotal[suffix]),            \
#               '{:,.2f}'.format(PosNegRatio[suffix]),        \
#               '{:,.2f}'.format(ROI[suffix]))
#
#       suffixstats_outfile.close()
#
#
#
#       # SET UP A USEFUL DATA STRUCTURE
#       FromSuffixToWords = dict()    # key - suffix
#       for suffix in suffixList:     # value - list of (word, delta) pairs for that suffix, listed by delta in descending order
#           FromSuffixToWords[suffix] = []
#
#       for key, delta in WordListOrderedByDelta:
#           this_suffix = SuffixedCandidateDict[key].m_Subwords[1]
#           FromSuffixToWords[this_suffix].append((key, delta))
#
#
#       # FOR EACH SUFFIX, SET A THRESHOLD BASED ON DELTA PROPERTIES FOR WHICH OF ITS
#       # ASSOCIATED WORDS SHOULD GO INTO THE LEXICON
#       Cutoff = dict()  # Key: proposed suffix
#                        # Value: minimum delta for a word in its list to qualify for the lexicon
#
#       # These parameters must be set; so far they look OK for English.
#       DeltaTotalLimit = 1
#       PosNegRatioLimit = 1.9
#       ROILimit = .1
#       DefaultWordDeltaLimit = 1    # This is the standard expectation for words formed on a real suffix.
#
#       for suffix in FromSuffixToWords:
#           if DeltaTotal[suffix] < DeltaTotalLimit:    # Clearly a false suffix! Don't admit any word from its list.
#               Cutoff[suffix] = sys.maxsize
#
#           elif PosNegRatio[suffix] < PosNegRatioLimit:    # May or may not be a real suffix, but likely that many items
#               Cutoff[suffix] = sys.maxsize        # in its list are not real stem+suffix formations.
#
#           elif ROI[suffix] < ROILimit:            # similar to preceding case
#               Cutoff[suffix] = sys.maxsize
#
#           elif (CountPosDeltas[suffix] < CountNegDeltas[suffix] and \
#               CountNegDeltas[suffix] < CountZeroDeltas[suffix]):     # Probably a real suffix, but this property indicates
#               Cutoff[suffix] = DefaultWordDeltaLimit + 1             # that more bad formations occur.
#
#           else:
#               Cutoff[suffix] = DefaultWordDeltaLimit  # A real, well-behaved suffix!
#
#       if True:
#           print >> outfile, "Cutoff values for suffixed words:"
#           for suffix in suffixList:
#               print >> outfile, "%8s  %8s" % \
#               (suffix, '{:,}'.format(Cutoff[suffix]))
#
#
#       # THIS IS THE GOAL OF ALL THE PRECEDING WORK
#       # Add qualified suffixed candidates to the lexicon
#       NomineeList = list()
#       latex_data= list()
#       latex_data.append("piece   count   subword    subword")
#
#       for suffix in FromSuffixToWords:
#           for (word, delta) in FromSuffixToWords[suffix]:
#               if delta >= Cutoff[suffix]:
#                   admitted_entry = self.AddEntry(word, temp_lexicon.m_EntryDict[word])   # Use expected count for this initial, pre-parse entry
#                   NomineeList.append((word, admitted_entry))
#
#                   # for the trace
#                   admitted_entry.m_CountRegister.append((current_iteration, admitted_entry.m_ParseCount, 0, []))  # as in UpdateRegister()
#                   for subword in admitted_entry.m_Subwords:
#                       self.m_EntryDict[subword].m_ReprCount += 1
#                               self.m_EntryDict[subword].m_Extwords.append(word)
#                               self.m_EntryDict[subword].m_NewExtwords.append(word)  #will go into this word's m_CountRegister to display along with changed counts
#
#                   # for display
#                       expr = ""
#                       for subword in admitted_entry.m_Subwords:
#                           expr = expr + "%12s" % (subword)
#                   print "%22s %8s %8s   %-50s" % (word,  '{:,}'.format(delta), '{:,}'.format(admitted_entry.m_ParseCount), expr)
#                   latex_data.append(word +  "\t" + '{:,} {}'.format(admitted_entry.m_ParseCount, expr) )
#                   # DESIRABLE TO DISPLAY DELTA -- NEED HELP MAKING IT WORK!
#
#               else:
#                   break    # deltas are descending, so go on to the next suffix
#
#       MakeLatexTable(latex_data,outfile)
#
#       return NomineeList      # NOTE THAT THE RETURN VALUE IS NOT USED  # July 18, 2015 - Now used to update DictionaryCost
#
#
# ---------------------------------------------------------#
    def Expectation(self):
        self.m_NumberOfHypothesizedRunningWords = 0
        for this_line in self.m_Corpus:
            wordlength = len(this_line)
            ForwardProb = dict()
            BackwardProb = dict()
            Forward(this_line,ForwardProb)
            Backward(this_line,BackwardProb)
            this_word_prob = BackwardProb[0]

            if WordProb > 0:
                for nPos in range(wordlength):
                    for End in range(nPos, wordlength-1):
                        if End- nPos + 1 > self.m_SizeOfLongestEntry:
                            continue
                        if nPos == 0 and End == wordlength - 1:
                            continue
                        Piece = this_line[nPos, End+1]
                        if Piece in self.m_EntryDict:
                            this_entry = self.m_EntryDict[Piece]
                            CurrentIncrement = ((ForwardProb[nPos] * BackwardProb[End+1])* this_entry.m_Frequency ) / WordProb
                            this_entry.m_Count += CurrentIncrement
                            self.m_NumberOfHypothesizedRunningWords += CurrentIncrement


# ---------------------------------------------------------#
    def Maximization(self):
        for entry in self.m_EntryDict:
            entry.m_Frequency = entry.m_Count / self.m_NumberOfHypothesizedRunningWords

# ---------------------------------------------------------#
    def Forward (self, this_line,ForwardProb):
        ForwardProb[0]=1.0
        for Pos in range(1,Length+1):
            ForwardProb[Pos] = 0.0
            if (Pos - i > self.m_SizeOfLongestEntry):
                break
            Piece = this_line[i,Pos+1]
            if Piece in self.m_EntryDict:
                this_Entry = self.m_EntryDict[Piece]
                vlProduct = ForwardProb[i] * this_Entry.m_Frequency
                ForwardProb[Pos] = ForwardProb[Pos] + vlProduct
        return ForwardProb

# ---------------------------------------------------------#
    def Backward(self, this_line,BackwardProb):

        Last = len(this_line) -1
        BackwardProb[Last+1] = 1.0
        for Pos in range( Last, Pos >= 0,-1):
            BackwardProb[Pos] = 0
            for i in range(Pos, i <= Last,-1):
                if i-Pos +1 > m_SizeOfLongestEntry:
                    Piece = this_line[Pos, i+1]
                    if Piece in self.m_EntryDict[Piece]:
                        this_Entry = self.m_EntryDict[Piece]
                        if this_Entry.m_Frequency == 0.0:
                            continue
                        vlProduct = BackwardProb[i+1] * this_Entry.m_Frequency
                        BackwardProb[Pos] += vlProduct
        return BackwardProb


# ---------------------------------------------------------#
    def PrintLexicon(self, outfile):
        print("\n\nLEXICON with trace information", file=outfile)
        for key in sorted(self.m_EntryDict.keys()):
            self.m_EntryDict[key].Display(outfile)

        print("\n\nDELETIONS", file=outfile)
        for key in self.m_DeletionDict:
            self.m_DeletionDict[key].Display(outfile)

# ---------------------------------------------------------#
    def RecallPrecision(self, iteration_number, outfile):

        # the following calculations are precision and recall *for breaks* (not for morphemes)
        total_true_positive_for_break = 0

        for linenumber in range(len(self.m_BreakPointList)):
            truth = self.m_BreakPointList[linenumber]
            if len(truth) < 2:     #NOTE There are 10 such lines in brown-english.txt
                print("Skipping this line:", self.m_Corpus[linenumber], file=outfile)
                continue
            hypothesis = list()
            hypothesis_line_length = 0
            for piece in self.m_ParsedCorpus[linenumber]:    # NOTE audrey 2015_08_23
                hypothesis_line_length += len(piece)         #  This could be constructed in ParseCorpus() and stored in Lexicon.
                hypothesis.append(hypothesis_line_length)    #  Analogous to m_BreakPointList  (and m_NumberOfTrueRunningWords).
            #number_of_hypothesized_words = len(hypothesis)  #  (We already store m_NumberOfHypothesizedRunningWords.)

            true_positive_for_break = len(set(hypothesis).intersection(set(truth)))
            #if linenumber == 86:
                #print("For linenumber ", linenumber, " len(truth) =", len(truth), "BUT true_positive_for_break =", true_positive_for_break)
                #print("truth is ", truth)
                #print("hypothesis is ", hypothesis)
            total_true_positive_for_break += true_positive_for_break

        total_break_precision = float(total_true_positive_for_break) /  self.m_NumberOfHypothesizedRunningWords
        total_break_recall    = float(total_true_positive_for_break) /  self.m_NumberOfTrueRunningWords
        self.m_Break_based_RecallPrecisionHistory.append((iteration_number,  total_break_precision,total_break_recall))

        formatstring = "%16s %12s %6.4f %9s %6.4f"
        print()
        print(formatstring %( "Break based word", "precision", total_break_precision, "recall", total_break_recall))
        print(formatstring %( "Break based word", "precision", total_break_precision, "recall", total_break_recall), file=outfile)


        # Token_based precision for word discovery:

        if (True):
            true_positives = 0
            for (word, this_words_entry) in self.m_EntryDict.items():
                if word in self.m_TrueDictionary:
                    true_count = self.m_TrueDictionary[word]
                    these_true_positives = min(true_count, this_words_entry.m_Count)
                else:
                    these_true_positives = 0
                true_positives += these_true_positives

            word_recall = float(true_positives) / self.m_NumberOfTrueRunningWords
            word_precision = float(true_positives) / self.m_NumberOfHypothesizedRunningWords
            self.m_Token_based_RecallPrecisionHistory.append((iteration_number,  word_precision,word_recall))

            print(formatstring %( "Token_based word", "precision", word_precision, "recall", word_recall), file=outfile)
            print(formatstring %( "Token_based word", "precision", word_precision, "recall", word_recall))


        # Type_based precision for word discovery:

        if (True):
            true_positives = 0
            for (word, this_words_entry) in self.m_EntryDict.items():
                if word in self.m_TrueDictionary:
                    true_positives +=1

            word_recall = float(true_positives) / len(self.m_TrueDictionary)
            word_precision = float(true_positives) / len(self.m_EntryDict)
            self.m_Type_based_RecallPrecisionHistory.append((iteration_number,  word_precision,word_recall))

            #print >>outfile, "\n\n***\n"
            #print "Type_based Word Precision  %6.4f; Word Recall  %6.4f" %(word_precision ,word_recall)
            print(formatstring %( " Type_based word", "precision", word_precision, "recall", word_recall), file=outfile)
            print(formatstring %( " Type_based word", "precision", word_precision, "recall", word_recall))


# ---------------------------------------------------------#
    def RecallPrecision_do_not_use(self, iteration_number, outfile):      # break-based calculation is simplified in version above

        print("Now at top of RecallPrecision function")
        print("linenumber = ", iteration_number)            #just to get a few specific lines - has nothing to do with iteration itself
        print("Here is self.m_BreakPointList[linenumber]    ", self.m_BreakPointList[iteration_number])
        print("Its length is ", len(self.m_BreakPointList[iteration_number]))
        truth = list(self.m_BreakPointList[iteration_number])
        print("Here is truth   ", truth)
        print("Length of truth is ", len(truth))
        print("\n\n")

        total_true_positive_for_break = 0
        total_number_of_hypothesized_words = 0
        total_number_of_true_words = 0
        for linenumber in range(len(self.m_BreakPointList)):
            truth = list(self.m_BreakPointList[linenumber])
            if len(truth) < 2:
                print("Skipping this line:", self.m_Corpus[linenumber], file=outfile)
                continue
            #number_of_true_words = len(truth) -1   #No--For the beginning line, for example, there are 23 words, counting '.' as a separate word. And len(truth) = 23.
            number_of_true_words = len(truth)
            hypothesis = list()
            hypothesis_line_length = 0
            accurate_word_discovery = 0
            true_positive_for_break = 0
            word_too_big = 0
            word_too_small = 0
            real_word_lag = 0
            hypothesis_word_lag = 0

            for piece in self.m_ParsedCorpus[linenumber]:    # NOTE audrey 2015_08_23
                hypothesis_line_length += len(piece)         #  This could be constructed in ParseCorpus() and stored in Lexicon.
                hypothesis.append(hypothesis_line_length)    #  We already store m_NumberOfHypothesizedRunningWords.
            number_of_hypothesized_words = len(hypothesis)   #  Analogous to m_BreakPointList  and  m_NumberOfTrueRunningWords.
            if linenumber == iteration_number:
                print("hypothesis = ", hypothesis)

            # state 0: at the last test, the two parses were in agreement
            # state 1: at the last test, truth was # and hypothesis was not
            # state 2: at the last test, hypothesis was # and truth was not
            pointer = 0
            state = 0
            while (len(truth) > 0 and len(hypothesis) > 0):

                next_truth = truth[0]
                next_hypothesis  = hypothesis[0]
                if state == 0:
                    real_word_lag = 0
                    hypothesis_word_lag = 0

                    if next_truth == next_hypothesis:
                        pointer = truth.pop(0)
                        hypothesis.pop(0)
                        true_positive_for_break += 1
                        accurate_word_discovery += 1
                        state = 0
                    elif next_truth < next_hypothesis:
                        pointer = truth.pop(0)
                        real_word_lag += 1
                        state = 1
                    else: #next_hypothesis < next_truth:
                        pointer = hypothesis.pop(0)
                        hypothesis_word_lag = 1
                        state = 2
                elif state == 1:
                    if next_truth == next_hypothesis:
                        pointer = truth.pop(0)
                        hypothesis.pop(0)
                        true_positive_for_break += 1
                        word_too_big += 1
                        state = 0
                    elif next_truth < next_hypothesis:
                        pointer = truth.pop(0)
                        real_word_lag += 1
                        state = 1 #redundantly
                    else:
                        pointer = hypothesis.pop(0)
                        hypothesis_word_lag += 1
                        state = 2
                else: #state = 2
                    if next_truth == next_hypothesis:
                        pointer = truth.pop(0)
                        hypothesis.pop(0)
                        true_positive_for_break += 1
                        word_too_small +=1
                        state = 0
                    elif next_truth < next_hypothesis:
                        pointer = truth.pop(0)
                        real_word_lag += 1
                        state = 1
                    else:
                        pointer = hypothesis.pop(0)
                        hypothesis_word_lag += 1
                        state =2


            #precision = float(true_positive_for_break) /  number_of_hypothesized_words
            #recall    = float(true_positive_for_break) /  number_of_true_words

            total_true_positive_for_break += true_positive_for_break
            if linenumber == iteration_number:
                print("true_positive_for_break = ", true_positive_for_break)
            total_number_of_hypothesized_words += number_of_hypothesized_words
            total_number_of_true_words += number_of_true_words


        print("Info_RecallPrecision")
        print("iteration_number = ", iteration_number)
        print("total_number_of_true_words = ", total_number_of_true_words)
        print("self.m_NumberOfTrueRunningWords = ", self.m_NumberOfTrueRunningWords)
        print("total_number_of_hypothesized_words = ", total_number_of_hypothesized_words)
        print("self.m_NumberOfHypothesizedRunningWords = ", self.m_NumberOfHypothesizedRunningWords)


        # the following calculations are precision and recall *for breaks* (not for morphemes)

        formatstring = "%16s %12s %6.4f %9s %6.4f"
        total_break_precision = float(total_true_positive_for_break) /  total_number_of_hypothesized_words
        total_break_recall    = float(total_true_positive_for_break) /  total_number_of_true_words

        ### WHY HERE ????
        #self.m_CorpusCostHistory.append( self.m_CorpusCost)   MOVED TO Report()  August 21, 2015

        self.m_Break_based_RecallPrecisionHistory.append((iteration_number,  total_break_precision,total_break_recall))
        print(formatstring %( "Break based word", "precision", total_break_precision, "recall", total_break_recall))
        print(formatstring %( "Break based word", "precision", total_break_precision, "recall", total_break_recall), file=outfile)


        # Token_based precision for word discovery:

        if (True):
            true_positives = 0
            for (word, this_words_entry) in self.m_EntryDict.items():
                if word in self.m_TrueDictionary:
                    true_count = self.m_TrueDictionary[word]
                    these_true_positives = min(true_count, this_words_entry.m_Count)
                else:
                    these_true_positives = 0
                true_positives += these_true_positives
            word_recall = float(true_positives) / self.m_NumberOfTrueRunningWords
            word_precision = float(true_positives) / self.m_NumberOfHypothesizedRunningWords
            self.m_Token_based_RecallPrecisionHistory.append((iteration_number,  word_precision,word_recall))

            print(formatstring %( "Token_based Word Precision", word_precision, "recall", word_recall), file=outfile)
            print(formatstring %( "Token_based Word Precision", word_precision, "recall", word_recall))


        # Type_based precision for word discovery:
        if (True):
            true_positives = 0
            for (word, this_words_entry) in self.m_EntryDict.items():
                if word in self.m_TrueDictionary:
                    true_positives +=1
            word_recall = float(true_positives) / len(self.m_TrueDictionary)
            word_precision = float(true_positives) / len(self.m_EntryDict)
            self.m_Type_based_RecallPrecisionHistory.append((iteration_number,  word_precision,word_recall))

            #print >>outfile, "\n\n***\n"
#            print "Type_based Word Precision  %6.4f; Word Recall  %6.4f" %(word_precision ,word_recall)
            print(formatstring %( " Type_based word", "precision", word_precision, "recall", word_recall), file=outfile)
            print(formatstring %( " Type_based word", "precision", word_precision, "recall", word_recall))

# ---------------------------------------------------------#
    def PrintRecallPrecision(self, itarget, outfile):
        print("\t\t\tBreak\t\tToken-based\t\tType-based", file=outfile)
        print("\t\t\tprecision\trecall\tprecision\trecall\tprecision\trecall", file=outfile)
        for iterno in range(itarget+1):      #was  (range(numberofcycles-1)):
        #for iterno in range(1, numberofcycles+1):  #for the 3 histories, may not be consistent whether start at 0 or 1
            print("printing iterno", iterno)
            (iteration, p1,r1) = self.m_Break_based_RecallPrecisionHistory[iterno]
            (iteration, p2,r2) = self.m_Token_based_RecallPrecisionHistory[iterno]
            (iteration, p3,r3) = self.m_Type_based_RecallPrecisionHistory[iterno]
            cost1 = self.m_DictionaryCostHistory[iterno]  # was int(self.m_DictionaryCostHistory[iterno])
            cost2 = self.m_CorpusCostHistory[iterno]      # was int(self.m_CorpusCostHistory[iterno] )
            #print >>outfile,"%3i\t%8.3f\t%8.3f\t%8.3f\t%8.3f\t%8.3f\t%8.3f" %(iteration, r1,p1,r2,p2,r3,p3)
            print(iteration,"\t",cost1, "\t", cost2, "\t", p1,"\t",r1,"\t",p2,"\t",r2,"\t",p3,"\t",r3, file=outfile)

# ---------------------------------------------------------#
    def ExtendLexicon(self, method, iteration_number, howmany, outfile):
        #TempDeletionList = self.FilterZeroCountEntries(iteration_number)  # DON"T NEED TempDeletionList
        if method == "standard":
            NomineeList = self.GenerateCandidates_standardmethod(iteration_number, howmany, outfile)
        #elif method == "suffix": # if no method specified, no new candidates will be generated
            #NomineeList = self.GenerateCandidates_suffixmethod()

        self.FilterZeroCountEntries(outfile)  # MOVED DOWNWARD.
        self.ComputeDictFrequencies()   # uses Count info from previous parse and GenerateCandidates()

        # COMPUTE DICTIONARY COST - composition-based
        #self.m_LexiconCost = self.m_InitialLexCost
        #for key, entry in self.m_EntryDict.items():
            #self.m_LexiconCost += entry.m_ReprCount *  -1 * math.log(entry.m_Frequency, 2)

        # COMPUTE DICTIONARY COST - letter-based
        #print "Now make sure m_LetterPlog is OK"
        #for key, entry in self.m_LetterPlog.items():
        #   print "key = ", key, "    plog = ", self.m_LetterPlog[key]

        #print "Now in ExtendLexicon: Compute the dictionary cost  letter-based"  # USE THE FUNCTION INSTEAD?
        self.m_DictionaryCost = 0.0
        for word in self.m_EntryDict:
            #print "word = ", word
            letters = list(word)
            for letter in letters:
                self.m_DictionaryCost += self.m_LetterPlog[letter]

        # UPDATE DICTIONARY COST  July 19, 2015: For iters 20, 21, 22, the whole-dictionary calculation is faster
        #for word in TempDeletionList:
        #   letters = list(word)
        #   for letter in letters:
        #       self.m_DictionaryCost -= self.m_LetterPlog[letter]

        #for (word, entry) in NomineeList:
        #   letters = list(word)
        #   for letter in letters:
        #       self.m_DictionaryCost += self.m_LetterPlog[letter]

        #self.m_DictionaryCostHistory.append(self.m_DictionaryCost)   #moved to Report()

# ---------------------------------------------------------#
    def Report(self, iteration_number, outfile):
        # DictionaryCost is calculated in ExtendLexicon(), CorpusCost in ParseCorpus().
        print("Cost: ")
        print("-%16s" % 'Corpus: ',    "{0:18,.4f}".format(self.m_CorpusCost))
        print("-%16s" % 'Dictionary: ',   "{0:18,.4f}".format(self.m_DictionaryCost))
        print("-%16s" % 'Combined: ',  "{0:18,.4f}".format(self.m_CorpusCost + self.m_DictionaryCost))

        print("Cost: ", file=outfile)
        print("-%16s" % 'Corpus: ',    "{0:18,.4f}".format(self.m_CorpusCost), file=outfile)
        print("-%16s" % 'Dictionary: ',   "{0:18,.4f}".format(self.m_DictionaryCost), file=outfile)
        print("-%16s" % 'Combined: ',  "{0:18,.4f}".format(self.m_CorpusCost + self.m_DictionaryCost), file=outfile)

        self.m_CorpusCostHistory.append(self.m_CorpusCost)
        self.m_DictionaryCostHistory.append(self.m_DictionaryCost)
        self.RecallPrecision(iteration_number, outfile)
        self.UpdateLexEntryRegisters(iteration_number)


# ---------------------------------------------------------#
def PrintList(my_list, outfile):
    print(file=outfile)
    for item in my_list:
        print(item, end=" ", file=outfile)

# ---------------------------------------------------------#

def LoadSavedStateFromFile(statefile, ibase, itarget):
    if not (itarget > ibase):
        print("\nERROR")
        print("itarget must be an integer greater than ibase.")
        print("Exiting...\n")
        sys.exit()

    if (not os.path.isfile(statefile)):
        print("\nERROR")
        print("The designated statefile, ", statefile, ", does not exist.")
        print("Exiting...\n")
        sys.exit()

    fd = open(statefile, "r")
    lines = fd.readlines()
    fd.close()

    for line in lines:
        if "Last iteration" in line:      #this is 'in' operator
            items = line.split('=')
            iend = int(items[1])
            #print("Found  Last iteration = ", iend)
            if ibase != iend:
                print("\nERROR")
                print("Mismatch between ibase iteration =", ibase, "and last recorded iteration =", iend, "on statefile")
                #print(statefile, "has state information for iteration", str(iend)+",")
                #print("not for ibase iteration", ibase, "as required.")
                print("Exiting...\n")
                sys.exit()
            break

    print("LOADING SAVED STATE...")
    serialstr = lines[-1]
    this_lexicon = jsonpickle.decode(serialstr)
    return this_lexicon


def main(language, corpus, datafolder,
         ibase, itarget, statefile, candidatesperiteration, howmuchcorpus, verboseflag):

    # REQUIRED ARGUMENTS
    if (ibase == None or itarget == None):
        print("\nERROR")
        print("ibase and itarget must be specified, with itarget > ibase >= 0")
        print("Exiting...\n")
        sys.exit()

    if (ibase > 0 and statefile == None):
        print("\nERROR")
        print("To resume processing beyond the ibase iteration, a file specifying the previous state must be provided.")
        print("Use the '--statefile = ' command line option.")
        print("Exiting...\n")
        sys.exit()

    numberofcycles = itarget - ibase    # for convenience


    # SET UP OUTPUT FILES

    i = datetime.datetime.now()
    #timelabel = "%s_%s_%s.%s_%s" % (i.year, i.month, i.day, i.hour, i.minute)
    timelabel = i.strftime("%Y_%m_%d.%H_%M")
    iterlabel = "[%s,%s]" % (ibase+1, itarget)

    longlabel  = "wb-%s-%s" % (timelabel, iterlabel)
    shortlabel = "wb-%s" % (iterlabel)

    outfolder = Path(datafolder, language, "wordbreaking", longlabel)
    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    outfile_pathname                 = str(Path(outfolder, shortlabel + ".txt"))
    outfile_corpus_pathname          = str(Path(outfolder, shortlabel + "_brokencorpus.txt"))
    outfile_lexicon_pathname         = str(Path(outfolder, shortlabel + "_lexicon_.txt"))
    outfile_recallprecision_pathname = str(Path(outfolder, shortlabel + "_recallprecision.tsv"))
    outfile_jsonpickle_pathname      = str(Path(outfolder, longlabel  + "_jsonpickle.txt"))

    outfile                 = open(outfile_pathname, "w")
    outfile_corpus          = open(outfile_corpus_pathname, "w")
    outfile_lexicon         = open(outfile_lexicon_pathname, "w")
    outfile_recallprecision = open(outfile_recallprecision_pathname, "w")
    outfile_jsonpickle      = open(outfile_jsonpickle_pathname, "w")

    # Header for outfile
    print("# corpus = " + str(corpus), file=outfile)
    print("# state loaded from " + str(statefile), file=outfile)
    print("# state saved to " + longlabel  + "_jsonpickle.txt", file=outfile)
    print("# " + str(numberofcycles) + " cycles, ", "(first = " + str(ibase + 1) + ", last = " + str(itarget) + ")", file=outfile)
    print("# " + str(candidatesperiteration) + " candidates on each cycle.", file=outfile)
    print("# lines read from original corpus = ", howmuchcorpus, file=outfile)

    # Header for jsonpickle outfile
    print("# Corpus = " + str(corpus), file=outfile_jsonpickle)
    print("# Date = " + i.strftime("%Y_%m_%d"), file=outfile_jsonpickle)
    print("# Time = " + i.strftime("%H_%M"), file=outfile_jsonpickle)
    print("# First iteration = " + str(ibase+1), file=outfile_jsonpickle)
    print("# Last iteration = " + str(itarget), file=outfile_jsonpickle)
    print("# Notes = working on reacting to load state errors",file=outfile_jsonpickle)
    print(file=outfile_jsonpickle)

    infile_corpus_pathname = str(Path(datafolder, language, corpus))


    # INITIAL PROCESSING

    if False:
        if prev_iteration_number == 0:
            this_lexicon.ReadBrokenCorpus (corpusfilename, corpuslinestoread)
        else:
            print("LOADING SAVED STATE...")
            this_lexicon.LoadState(prev_iteration_number)  # Oops

    if (ibase == 0 and statefile == None):
        this_lexicon = Lexicon()
        this_lexicon.ReadBrokenCorpus (infile_corpus_pathname, outfile, howmuchcorpus)
        #this_lexicon.ParseCorpus (outfile, current_iteration)  # CHANGE NEEDED WHEN LOAD SAVED STATE
        #this_lexicon.ParseCorpus (outfile, 0)  # CHANGE NEEDED WHEN LOAD SAVED STATE

    else:
        this_lexicon = LoadSavedStateFromFile(statefile, ibase, itarget)


    # ITERATIVE PROCESSING

    t_start = time.time()          # 2014_07_20    #2015_07_18   TEMPORARY
    for current_iteration in range(1+ibase, 1+itarget):
        print("\n\n Iteration number " + str(current_iteration) + " (#" + str(current_iteration-ibase) + " of " + str(numberofcycles) + " for this run)")
        print("\n\n Iteration number", current_iteration, file=outfile)
        method = "standard"  #FOR NOW

        this_lexicon.ExtendLexicon(method, current_iteration, candidatesperiteration, outfile)
        this_lexicon.ParseCorpus (current_iteration, outfile)
        this_lexicon.Report(current_iteration, outfile)


    # COMPLETION

    t_end = time.time()
    elapsed = t_end - t_start
    print ("\n\nElapsed wall time in seconds = ", elapsed)
    print ("\n\nElapsed wall time in seconds = ", elapsed, file=outfile)

    this_lexicon.PrintParsedCorpus(outfile_corpus)
    this_lexicon.PrintLexicon(outfile_lexicon)
    this_lexicon.PrintRecallPrecision(itarget, outfile_recallprecision)

    print("\nSAVING CURRENT STATE to '"  +  longlabel  + "_jsonpickle.txt'")
    print("[Processing can be continued from this point in a later run ")
    print("by loading this file through the '--statefile = ' command line option.]")
    print()
    serialstr = jsonpickle.encode(this_lexicon)
    print(serialstr, file=outfile_jsonpickle)

    outfile.close()
    outfile_corpus.close()
    outfile_lexicon.close()
    outfile_recallprecision.close()
    outfile_jsonpickle.close()


if __name__ == "__main__":

    args = makeArgParser().parse_args()

    ibase = args.ibase
    itarget = args.itarget
    statefile = args.statefile
    candidatesperiteration = args.candidates
    corpuslinestoread = args.corpuslinestoread
    verboseflag = args.verbose

    if corpuslinestoread == sys.maxsize:
        howmuchcorpus = "all"
    else:
        howmuchcorpus = str(corpuslinestoread)

    description="You are running {}.\n".format(__file__) + \
                "This program segments unbroken text into words.\n" + \
                "\tibase = {}\n".format(ibase) + \
                "\titarget = {}\n".format(itarget) + \
                "\tstatefile = {}\n".format(statefile) + \
                "\tcandidatesperiteration = {}\n".format(candidatesperiteration) + \
                "\tcorpuslinestoread = {}\n".format(howmuchcorpus) + \
                "\t_verbose = {}".format(verboseflag)

    language, corpus, datafolder = get_language_corpus_datafolder(args.language,
                                      args.corpus, args.datafolder, args.config,
                                      description=description,
                                      scriptname=__file__)

    main(language, corpus, datafolder,
         ibase, itarget, statefile, candidatesperiteration, howmuchcorpus, verboseflag)