Merge pull request #2865 from internetarchive/hotfix/2851-LCCNs
Correctly import LCCNs
mekarpeles authored Jan 21, 2020
2 parents cf1d395 + e8d4856 commit 5007d19
Showing 24 changed files with 351 additions and 361 deletions.
23 changes: 13 additions & 10 deletions openlibrary/catalog/get_ia.py
@@ -1,17 +1,20 @@
from __future__ import print_function
from openlibrary.catalog.marc.marc_binary import MarcBinary
from openlibrary.catalog.marc.marc_xml import MarcXml
from openlibrary.catalog.marc import fast_parse, parse
from infogami import config
from lxml import etree
import xml.parsers.expat

import os.path
import socket
from time import sleep
import traceback
from openlibrary.core import ia
import xml.parsers.expat

from infogami import config
from lxml import etree
from six.moves import urllib
from time import sleep

from openlibrary.catalog.marc.marc_binary import MarcBinary
from openlibrary.catalog.marc.marc_xml import MarcXml
from openlibrary.catalog.marc.parse import read_edition
from openlibrary.catalog.marc.fast_parse import read_file as fast_read_file # Deprecated import
from openlibrary.core import ia


IA_BASE_URL = config.get('ia_base_url')
@@ -83,7 +86,7 @@ def get_ia(identifier):
:rtype: dict
"""
marc = get_marc_record_from_ia(identifier)
return parse.read_edition(marc)
return read_edition(marc)

def files(identifier):
url = item_file_url(identifier, 'files.xml')
@@ -174,7 +177,7 @@ def read_marc_file(part, f, pos=0):
:rtype: (int, str, str)
:return: (Next position, Current source_record name, Current single MARC record)
"""
for data, int_length in fast_parse.read_file(f):
for data, int_length in fast_read_file(f):
loc = "marc:%s:%d:%d" % (part, pos, int_length)
pos += int_length
yield (pos, loc, data)
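With the imports reorganized above, get_ia() now reaches read_edition() in openlibrary.catalog.marc.parse directly, and the legacy fast_parse reader survives only under an explicitly deprecated alias. A minimal usage sketch, assuming a configured Open Library environment; the archive.org identifier below is hypothetical, and keys such as 'lccn' appear only when the MARC record carries them:

    from openlibrary.catalog.get_ia import get_ia

    # Fetch the item's MARC record from archive.org and parse it into an edition dict.
    edition = get_ia('examplelccnitem00test')  # hypothetical identifier
    # LCCNs, when present in MARC field 010, should now be imported correctly.
    print(edition.get('lccn'))
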
55 changes: 36 additions & 19 deletions openlibrary/catalog/marc/build_record.py
@@ -1,8 +1,12 @@
from __future__ import print_function
from fast_parse import *
""" This entire module is deprecated,
openlibrary.catalog.marc.parse is the preferred module
"""

import re
from deprecated import deprecated
from warnings import warn

from openlibrary.catalog.utils import pick_first_date
import re

re_question = re.compile('^\?+$')
re_lccn = re.compile('(...\d+).*')
@@ -26,7 +30,7 @@
'035', # oclc
'050', # lc classification
'082', # dewey
'100', '110', '111', # authors TODO
'100', '110', '111', # authors
'130', '240', # work title
'245', # title
'250', # edition
@@ -42,11 +46,12 @@
]

re_series = re.compile('^(.*) series$', re.I)
REASON = 'Use corresponding function in openlibrary.catalog.marc.parse instead.'

@deprecated(REASON)
def read_lccn(fields):
if '010' not in fields:
return {}

found = []
for line in fields['010']:
for k, v in get_subfields(line, ['a']):
@@ -62,6 +67,7 @@ def read_lccn(fields):

return {'lccn': found}

@deprecated(REASON)
def read_isbn(fields):
if '020' not in fields:
return {}
@@ -84,9 +90,9 @@ def read_isbn(fields):
ret.setdefault('isbn_13', []).append(i)
else:
ret.setdefault('isbn_10', []).append(i)

return ret

@deprecated(REASON)
def read_oclc(fields):
if '035' not in fields:
return {}
@@ -102,6 +108,7 @@ def read_oclc(fields):
found.append(oclc)
return {'oclc_number': found } if found else {}

@deprecated(REASON)
def read_author_person(line):
author = {}
contents = get_contents(line, ['a', 'b', 'c', 'd'])
@@ -129,14 +136,12 @@ def read_author_person(line):
author['fuller_name'] = ' '.join(contents['q'])
return author

@deprecated(REASON)
def read_authors(fields):
found = []
author = [tag for tag in fields if tag in ('100', '110', '111')]
if len(author) == 0:
return {}
if len(author) != 1:
for tag in ('100', '110', '111'):
print(tag, fields[tag])
assert len(author) == 1
if '100' in fields:
line = fields['100'][0]
@@ -152,6 +157,7 @@ def read_authors(fields):

return {'authors': [author]} if author else {}

@deprecated(REASON)
def read_title(fields):
if '245' not in fields:
return {}
@@ -180,6 +186,7 @@ def read_title(fields):
edition["physical_format"] = ' '.join(contents['h'])
return edition

@deprecated(REASON)
def read_lc_classification(fields):
if '050' not in fields:
return {}
@@ -201,6 +208,7 @@ def read_lc_classification(fields):
else:
return {}

@deprecated(REASON)
def read_dewey(fields):
if '082' not in fields:
return {}
@@ -209,9 +217,11 @@ def read_dewey(fields):
found += get_subfield_values(line, ['a'])
return {'dewey_decimal_class': found }

@deprecated(REASON)
def join_subfield_values(line, subfields):
return ' '.join(get_subfield_values(line, subfields))

@deprecated(REASON)
def read_work_titles(fields):
found = []
if '240' in fields:
@@ -228,6 +238,7 @@ def read_work_titles(fields):

return { 'work_titles': found } if found else {}

@deprecated(REASON)
def read_edition_name(fields):
if '250' not in fields:
return {}
@@ -236,6 +247,7 @@ def read_edition_name(fields):
found += [v for k, v in get_all_subfields(line)]
return {'edition_name': ' '.join(found)}

@deprecated(REASON)
def read_publisher(fields):
if '260' not in fields:
return {}
@@ -254,6 +266,7 @@ def read_publisher(fields):
edition["publish_places"] = publish_place
return edition

@deprecated(REASON)
def read_pagination(fields):
if '300' not in fields:
return {}
@@ -273,6 +286,7 @@ def read_pagination(fields):
edition["number_of_pages"] = max(valid)
return edition

@deprecated(REASON)
def read_series(fields):
found = []
for tag in ('440', '490', '830'):
@@ -296,6 +310,7 @@ def read_series(fields):
found.append(s)
return {'series': found} if found else {}

@deprecated(REASON)
def read_contributions(fields):
want = [
('700', 'abcde'),
@@ -311,13 +326,15 @@ def read_contributions(fields):
found.append(join_subfield_values(line, subfields))
return { 'contributions': found } if found else {}

@deprecated(REASON)
def remove_duplicates(seq):
u = []
for x in seq:
if x not in u:
u.append(x)
return u

@deprecated(REASON)
def read_subjects(fields):
want = [
('600', 'abcd'),
@@ -340,6 +357,7 @@ def read_subjects(fields):

return {'subjects': found} if found else {}

@deprecated(REASON)
def read_genres(fields):
found = []
for tag in '600', '650', '651':
@@ -349,6 +367,7 @@ def read_genres(fields):
found += get_subfield_values(line, ['v'])
return { 'genres': remove_duplicates(found) } if found else {}

@deprecated(REASON)
def read_translation(fields):
tag = '500'
if tag not in fields:
@@ -364,22 +383,20 @@ def read_translation(fields):
return { 'translation_of': m.group(2) }
return {}

@deprecated(REASON)
def read_notes(fields):
found = []
for tag in range(500,590):
if tag in (505, 520) or str(tag) not in fields:
continue
tag = str(tag)
for line in fields[tag]:
try:
x = get_lower_subfields(line)
except IndexError:
print(repr(line))
raise
x = get_lower_subfields(line)
if x:
found.append(' '.join(x))
return {'notes': '\n\n'.join(found)} if found else {}

@deprecated(REASON)
def read_toc(fields):
if '505' not in fields:
return {}
@@ -395,7 +412,6 @@ def read_toc(fields):
# http://openlibrary.org/show-marc/marc_miami_univ_ohio/allbibs0036.out:3918815:7321
if any(len(i) > 2048 for i in toc_split):
toc_split = [i.strip() for i in v.split('; ')]
# FIXME:
# http://openlibrary.org/show-marc/marc_western_washington_univ/wwu_bibs.mrc_revrev.mrc:938969487:3862
if any(len(i) > 2048 for i in toc_split):
toc_split = [i.strip() for i in v.split(' / ')]
@@ -422,17 +438,14 @@ def read_toc(fields):
found.append(i)
return { 'table_of_contents': found }

@deprecated(REASON)
def read_description(fields):
if '520' not in fields:
return {}
found = []
wrap = False
for line in fields['520']:
this = get_subfield_values(line, ['a'])
if len(this) != 1:
# print repr(fields['520'])
# print repr(line)
print(len(this))
assert len(this) == 1
found += this
if line[-3:-1] == '++':
@@ -441,6 +454,7 @@
wrap = False
return {'description': "\n\n".join(found) } if found else {}

@deprecated(REASON)
def read_other_titles(fields):
found = []

@@ -464,6 +478,7 @@

return {"other_titles": found} if found else {}

@deprecated(REASON)
def read_location(fields):
if '852' not in fields:
return {}
@@ -472,6 +487,7 @@ def read_location(fields):
found += [v for v in get_subfield_values(line, ['a']) if v]
return { 'location': found } if found else {}

@deprecated(REASON)
def read_url(fields):
if '856' not in fields:
return {}
@@ -480,6 +496,7 @@ def read_url(fields):
found += get_subfield_values(line, ['u'])
return { 'uri': found } if found else {}

@deprecated(REASON)
def build_record(data):
fields = {}
for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
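The pattern applied throughout build_record.py wraps each reader with the third-party 'deprecated' package (imported at the top of the file), so existing callers keep working but receive a DeprecationWarning steering them toward openlibrary.catalog.marc.parse. A standalone sketch of how that warning surfaces, using a stub function rather than code from this PR:

    import warnings
    from deprecated import deprecated

    REASON = 'Use corresponding function in openlibrary.catalog.marc.parse instead.'

    @deprecated(REASON)
    def read_lccn(fields):
        return {}

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        read_lccn({})
    # Expect a DeprecationWarning naming read_lccn and carrying the REASON text.
    print(caught[0].category.__name__, caught[0].message)
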