Merge pull request #2865 from internetarchive/hotfix/2851-LCCNs
Correctly import LCCNs
mekarpeles authored Jan 21, 2020
2 parents cf1d395 + e8d4856 commit 5007d19
Showing 24 changed files with 351 additions and 361 deletions.
23 changes: 13 additions & 10 deletions openlibrary/catalog/get_ia.py
@@ -1,17 +1,20 @@
from __future__ import print_function
from openlibrary.catalog.marc.marc_binary import MarcBinary
from openlibrary.catalog.marc.marc_xml import MarcXml
from openlibrary.catalog.marc import fast_parse, parse
from infogami import config
from lxml import etree
import xml.parsers.expat

import os.path
import socket
from time import sleep
import traceback
from openlibrary.core import ia
import xml.parsers.expat

from infogami import config
from lxml import etree
from six.moves import urllib
from time import sleep

from openlibrary.catalog.marc.marc_binary import MarcBinary
from openlibrary.catalog.marc.marc_xml import MarcXml
from openlibrary.catalog.marc.parse import read_edition
from openlibrary.catalog.marc.fast_parse import read_file as fast_read_file # Deprecated import
from openlibrary.core import ia


IA_BASE_URL = config.get('ia_base_url')
@@ -83,7 +86,7 @@ def get_ia(identifier):
:rtype: dict
"""
marc = get_marc_record_from_ia(identifier)
return parse.read_edition(marc)
return read_edition(marc)

def files(identifier):
url = item_file_url(identifier, 'files.xml')
@@ -174,7 +177,7 @@ def read_marc_file(part, f, pos=0):
:rtype: (int, str, str)
:return: (Next position, Current source_record name, Current single MARC record)
"""
for data, int_length in fast_parse.read_file(f):
for data, int_length in fast_read_file(f):
loc = "marc:%s:%d:%d" % (part, pos, int_length)
pos += int_length
yield (pos, loc, data)
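With the imports reorganized above, get_ia() now reaches read_edition() in openlibrary.catalog.marc.parse directly, and the legacy fast_parse reader survives only under an explicitly deprecated alias. A minimal usage sketch, assuming a configured Open Library environment; the archive.org identifier below is hypothetical, and keys such as 'lccn' appear only when the MARC record carries them:

    from openlibrary.catalog.get_ia import get_ia

    # Fetch the item's MARC record from archive.org and parse it into an edition dict.
    edition = get_ia('examplelccnitem00test')  # hypothetical identifier
    # LCCNs, when present in MARC field 010, should now be imported correctly.
    print(edition.get('lccn'))
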
55 changes: 36 additions & 19 deletions openlibrary/catalog/marc/build_record.py
@@ -1,8 +1,12 @@
from __future__ import print_function
from fast_parse import *
""" This entire module is deprecated,
openlibrary.catalog.marc.parse is the preferred module
"""

import re
from deprecated import deprecated
from warnings import warn

from openlibrary.catalog.utils import pick_first_date
import re

re_question = re.compile('^\?+$')
re_lccn = re.compile('(...\d+).*')
@@ -26,7 +30,7 @@
'035', # oclc
'050', # lc classification
'082', # dewey
'100', '110', '111', # authors TODO
'100', '110', '111', # authors
'130', '240', # work title
'245', # title
'250', # edition
@@ -42,11 +46,12 @@
]

re_series = re.compile('^(.*) series$', re.I)
REASON = 'Use corresponding function in openlibrary.catalog.marc.parse instead.'

@deprecated(REASON)
def read_lccn(fields):
if '010' not in fields:
return {}

found = []
for line in fields['010']:
for k, v in get_subfields(line, ['a']):
@@ -62,6 +67,7 @@ def read_lccn(fields):

return {'lccn': found}

@deprecated(REASON)
def read_isbn(fields):
if '020' not in fields:
return {}
@@ -84,9 +90,9 @@ def read_isbn(fields):
ret.setdefault('isbn_13', []).append(i)
else:
ret.setdefault('isbn_10', []).append(i)

return ret

@deprecated(REASON)
def read_oclc(fields):
if '035' not in fields:
return {}
@@ -102,6 +108,7 @@ def read_oclc(fields):
found.append(oclc)
return {'oclc_number': found } if found else {}

@deprecated(REASON)
def read_author_person(line):
author = {}
contents = get_contents(line, ['a', 'b', 'c', 'd'])
@@ -129,14 +136,12 @@ def read_author_person(line):
author['fuller_name'] = ' '.join(contents['q'])
return author

@deprecated(REASON)
def read_authors(fields):
found = []
author = [tag for tag in fields if tag in ('100', '110', '111')]
if len(author) == 0:
return {}
if len(author) != 1:
for tag in ('100', '110', '111'):
print(tag, fields[tag])
assert len(author) == 1
if '100' in fields:
line = fields['100'][0]
@@ -152,6 +157,7 @@ def read_authors(fields):

return {'authors': [author]} if author else {}

@deprecated(REASON)
def read_title(fields):
if '245' not in fields:
return {}
@@ -180,6 +186,7 @@ def read_title(fields):
edition["physical_format"] = ' '.join(contents['h'])
return edition

@deprecated(REASON)
def read_lc_classification(fields):
if '050' not in fields:
return {}
@@ -201,6 +208,7 @@ def read_lc_classification(fields):
else:
return {}

@deprecated(REASON)
def read_dewey(fields):
if '082' not in fields:
return {}
@@ -209,9 +217,11 @@ def read_dewey(fields):
found += get_subfield_values(line, ['a'])
return {'dewey_decimal_class': found }

@deprecated(REASON)
def join_subfield_values(line, subfields):
return ' '.join(get_subfield_values(line, subfields))

@deprecated(REASON)
def read_work_titles(fields):
found = []
if '240' in fields:
@@ -228,6 +238,7 @@ def read_work_titles(fields):

return { 'work_titles': found } if found else {}

@deprecated(REASON)
def read_edition_name(fields):
if '250' not in fields:
return {}
@@ -236,6 +247,7 @@ def read_edition_name(fields):
found += [v for k, v in get_all_subfields(line)]
return {'edition_name': ' '.join(found)}

@deprecated(REASON)
def read_publisher(fields):
if '260' not in fields:
return {}
@@ -254,6 +266,7 @@ def read_publisher(fields):
edition["publish_places"] = publish_place
return edition

@deprecated(REASON)
def read_pagination(fields):
if '300' not in fields:
return {}
@@ -273,6 +286,7 @@ def read_pagination(fields):
edition["number_of_pages"] = max(valid)
return edition

@deprecated(REASON)
def read_series(fields):
found = []
for tag in ('440', '490', '830'):
@@ -296,6 +310,7 @@ def read_series(fields):
found.append(s)
return {'series': found} if found else {}

@deprecated(REASON)
def read_contributions(fields):
want = [
('700', 'abcde'),
@@ -311,13 +326,15 @@ def read_contributions(fields):
found.append(join_subfield_values(line, subfields))
return { 'contributions': found } if found else {}

@deprecated(REASON)
def remove_duplicates(seq):
u = []
for x in seq:
if x not in u:
u.append(x)
return u

@deprecated(REASON)
def read_subjects(fields):
want = [
('600', 'abcd'),
@@ -340,6 +357,7 @@ def read_subjects(fields):

return {'subjects': found} if found else {}

@deprecated(REASON)
def read_genres(fields):
found = []
for tag in '600', '650', '651':
@@ -349,6 +367,7 @@ def read_genres(fields):
found += get_subfield_values(line, ['v'])
return { 'genres': remove_duplicates(found) } if found else {}

@deprecated(REASON)
def read_translation(fields):
tag = '500'
if tag not in fields:
@@ -364,22 +383,20 @@ def read_translation(fields):
return { 'translation_of': m.group(2) }
return {}

@deprecated(REASON)
def read_notes(fields):
found = []
for tag in range(500,590):
if tag in (505, 520) or str(tag) not in fields:
continue
tag = str(tag)
for line in fields[tag]:
try:
x = get_lower_subfields(line)
except IndexError:
print(repr(line))
raise
x = get_lower_subfields(line)
if x:
found.append(' '.join(x))
return {'notes': '\n\n'.join(found)} if found else {}

@deprecated(REASON)
def read_toc(fields):
if '505' not in fields:
return {}
@@ -395,7 +412,6 @@ def read_toc(fields):
# http://openlibrary.org/show-marc/marc_miami_univ_ohio/allbibs0036.out:3918815:7321
if any(len(i) > 2048 for i in toc_split):
toc_split = [i.strip() for i in v.split('; ')]
# FIXME:
# http://openlibrary.org/show-marc/marc_western_washington_univ/wwu_bibs.mrc_revrev.mrc:938969487:3862
if any(len(i) > 2048 for i in toc_split):
toc_split = [i.strip() for i in v.split(' / ')]
@@ -422,17 +438,14 @@ def read_toc(fields):
found.append(i)
return { 'table_of_contents': found }

@deprecated(REASON)
def read_description(fields):
if '520' not in fields:
return {}
found = []
wrap = False
for line in fields['520']:
this = get_subfield_values(line, ['a'])
if len(this) != 1:
# print repr(fields['520'])
# print repr(line)
print(len(this))
assert len(this) == 1
found += this
if line[-3:-1] == '++':
@@ -441,6 +454,7 @@
wrap = False
return {'description': "\n\n".join(found) } if found else {}

@deprecated(REASON)
def read_other_titles(fields):
found = []

@@ -464,6 +478,7 @@

return {"other_titles": found} if found else {}

@deprecated(REASON)
def read_location(fields):
if '852' not in fields:
return {}
@@ -472,6 +487,7 @@ def read_location(fields):
found += [v for v in get_subfield_values(line, ['a']) if v]
return { 'location': found } if found else {}

@deprecated(REASON)
def read_url(fields):
if '856' not in fields:
return {}
@@ -480,6 +496,7 @@ def read_url(fields):
found += get_subfield_values(line, ['u'])
return { 'uri': found } if found else {}

@deprecated(REASON)
def build_record(data):
fields = {}
for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
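The pattern applied throughout build_record.py wraps each reader with the third-party 'deprecated' package (imported at the top of the file), so existing callers keep working but receive a DeprecationWarning steering them toward openlibrary.catalog.marc.parse. A standalone sketch of how that warning surfaces, using a stub function rather than code from this PR:

    import warnings
    from deprecated import deprecated

    REASON = 'Use corresponding function in openlibrary.catalog.marc.parse instead.'

    @deprecated(REASON)
    def read_lccn(fields):
        return {}

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        read_lccn({})
    # Expect a DeprecationWarning naming read_lccn and carrying the REASON text.
    print(caught[0].category.__name__, caught[0].message)
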