diff --git a/chemdataextractor/doc/table.py b/chemdataextractor/doc/table.py index 24d2635..56a974f 100644 --- a/chemdataextractor/doc/table.py +++ b/chemdataextractor/doc/table.py @@ -22,7 +22,7 @@ ExtinctionHeadingParser, FluorescenceLifetimeHeadingParser, FluorescenceLifetimeCellParser, \ ElectrochemicalPotentialHeadingParser, ElectrochemicalPotentialCellParser, IrHeadingParser, IrCellParser, \ SolventCellParser, SolventHeadingParser, SolventInHeadingParser, UvvisAbsEmiQuantumYieldHeadingParser, \ - UvvisAbsEmiQuantumYieldCellParser, MeltingPointHeadingParser, MeltingPointCellParser, TempInHeadingParser, \ + UvvisAbsEmiQuantumYieldCellParser, MeltingPointHeadingParser, MeltingPointCellParser, GlassTransitionHeadingParser, GlassTransitionCellParser, TempInHeadingParser, \ UvvisAbsDisallowedHeadingParser, UvvisEmiQuantumYieldHeadingParser, UvvisEmiQuantumYieldCellParser # TODO: Sort out the above import... import module instead from ..nlp.tag import NoneTagger @@ -50,6 +50,7 @@ class Table(CaptionedElement): (FluorescenceLifetimeHeadingParser(), FluorescenceLifetimeCellParser()), (ElectrochemicalPotentialHeadingParser(), ElectrochemicalPotentialCellParser()), (MeltingPointHeadingParser(), MeltingPointCellParser()), + (GlassTransitionHeadingParser(), GlassTransitionCellParser()), (SolventHeadingParser(), SolventCellParser()), (SolventInHeadingParser(),), (TempInHeadingParser(),) diff --git a/chemdataextractor/doc/text.py b/chemdataextractor/doc/text.py index 1bdd54f..5980f74 100644 --- a/chemdataextractor/doc/text.py +++ b/chemdataextractor/doc/text.py @@ -26,6 +26,7 @@ from ..parse.table import CaptionContextParser from ..parse.ir import IrParser from ..parse.mp import MpParser +from ..parse.tg import TgParser from ..parse.nmr import NmrParser from ..parse.uvvis import UvvisParser from ..nlp.lexicon import ChemLexicon @@ -267,7 +268,7 @@ def _repr_html_(self): class Paragraph(Text): - parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), 
UvvisParser(), MpParser(), ContextParser()] + parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), ContextParser()] def _repr_html_(self): return '
' + self.text + '
' diff --git a/chemdataextractor/model.py b/chemdataextractor/model.py index 38d1b90..abadc98 100644 --- a/chemdataextractor/model.py +++ b/chemdataextractor/model.py @@ -388,6 +388,14 @@ class MeltingPoint(BaseModel): apparatus = StringType(contextual=True) +class GlassTransition(BaseModel): + """A glass transition temperature.""" + value = StringType() + units = StringType(contextual=True) + method = StringType(contextual=True) + concentration = StringType(contextual=True) + concentration_units = StringType(contextual=True) + class QuantumYield(BaseModel): """A quantum yield measurement.""" value = StringType() @@ -437,6 +445,7 @@ class Compound(BaseModel): ir_spectra = ListType(ModelType(IrSpectrum)) uvvis_spectra = ListType(ModelType(UvvisSpectrum)) melting_points = ListType(ModelType(MeltingPoint)) + glass_transitions = ListType(ModelType(GlassTransition)) quantum_yields = ListType(ModelType(QuantumYield)) fluorescence_lifetimes = ListType(ModelType(FluorescenceLifetime)) electrochemical_potentials = ListType(ModelType(ElectrochemicalPotential)) @@ -460,6 +469,9 @@ def merge_contextual(self, other): for item in self[k]: # print('item: %s' % item) for other_item in other.get(k, []): + #RBT Problem with doi: ma301230y. 
Had to add following check + if (isinstance(other_item,unicode) == True): + continue # if k in {'names', 'labels'}: # # TODO: Warn attempting to merge a contextual other that contains names/labels # continue diff --git a/chemdataextractor/parse/__init__.py b/chemdataextractor/parse/__init__.py index 8aaa710..8d7a3ab 100644 --- a/chemdataextractor/parse/__init__.py +++ b/chemdataextractor/parse/__init__.py @@ -23,6 +23,7 @@ from .context import ContextParser from .ir import IrParser from .mp import MpParser +from .tg import TgParser from .nmr import NmrParser from .table import CompoundHeadingParser, SolventHeadingParser, UvvisAbsDisallowedHeadingParser, SolventInHeadingParser from .table import TempInHeadingParser, SolventCellParser, CompoundCellParser, UvvisEmiHeadingParser @@ -30,6 +31,6 @@ from .table import QuantumYieldHeadingParser, QuantumYieldCellParser, UvvisEmiCellParser, UvvisAbsCellParser from .table import ExtinctionCellParser, UvvisAbsEmiQuantumYieldHeadingParser, UvvisAbsEmiQuantumYieldCellParser from .table import UvvisEmiQuantumYieldHeadingParser, UvvisEmiQuantumYieldCellParser, FluorescenceLifetimeHeadingParser -from .table import FluorescenceLifetimeCellParser, MeltingPointHeadingParser, MeltingPointCellParser +from .table import FluorescenceLifetimeCellParser, MeltingPointHeadingParser, MeltingPointCellParser, GlassTransitionHeadingParser, GlassTransitionCellParser from .table import ElectrochemicalPotentialHeadingParser, ElectrochemicalPotentialCellParser, CaptionContextParser from .uvvis import UvvisParser diff --git a/chemdataextractor/parse/context.py b/chemdataextractor/parse/context.py index 54ad39d..e4514ae 100644 --- a/chemdataextractor/parse/context.py +++ b/chemdataextractor/parse/context.py @@ -19,7 +19,7 @@ from .common import optdelim, hyphen, slash from ..utils import first from ..parse.base import BaseParser -from ..model import Compound, QuantumYield, NmrSpectrum, UvvisSpectrum, IrSpectrum, MeltingPoint, FluorescenceLifetime +from 
..model import Compound, QuantumYield, NmrSpectrum, UvvisSpectrum, IrSpectrum, MeltingPoint, GlassTransition, FluorescenceLifetime from .actions import join, merge, fix_whitespace from .cem import chemical_name from .elements import I, T, R, W, ZeroOrMore, Optional, Group, OneOrMore, Any, Not @@ -33,13 +33,17 @@ uvvis = (I('UV') + (hyphen | slash) + R('^vis(ible)?$', re.I) + Optional(R('^abs(or[bp]tion)?$')))('uvvis').add_action(join).add_action(fix_whitespace) ir = (R('^(FT-?)?IR|FT-?IS$'))('ir').add_action(join) mp = (I('melting') + I('points'))('melting_point').add_action(join) +tg = (I('glass') + I('transition') + I('temperature'))('glass_transition').add_action(join) pp = (I('photophysical') + (I('measurements') | I('properties')))('photophysical_properties').add_action(join) -measurement = Group(quantum_yield | nmr | uvvis | ir | mp | pp)('measurement') +measurement = Group(quantum_yield | nmr | uvvis | ir | mp | tg | pp)('measurement') result_noun = I('data') | I('results') | I('experiments') | I('spectra') verb = W('measured') | W('recorded') | W('collected') | W('taken') | W('acquired') | W('obtained') | W('run') | (W('carried') + W('out') | W('performed')) # | T('VBN') + +method = I('DSC') | I('TMA') | I('DTA') | I('RTL') | I('DMA') | I('DMTA') | I('dilatometer') | I('dilatometry') | I('PALS') + apparatus_type = R('^\d{2,}$') + W('MHz') brands = I('HORIBA') + I('Jobin') + I('Yvon') | I('Hitachi') | I('Bruker') | I('Cary') | I('Jeol') | I('PerkinElmer') | I('Agilent') | I('Shimadzu') | I('Varian') models = I('FluoroMax-4') | I('F-7000') | I('AVANCE') | I('Digital') | R('\d\d\d+') | I('UV–vis-NIR') | I('Mercury') | I('Avatar') | I('thermonicolet') | I('pulsed') | I('Fourier') | I('transform') @@ -60,7 +64,7 @@ standard = (ZeroOrMore(T('JJ')) + OneOrMore(T('NNP') | T('NN') | T('HYPH') | T('CD') | T('B-CM') | T('I-CM')))('standard').add_action(join).add_action(fix_whitespace) standard_phrase = (W('with') | W('using')).hide() + Optional(dt).hide() + standard + 
(ZeroOrMore(W('as') | dt) + Optional(T('JJ')) + I('standard')).hide() -context_phrase = Group(measurement + optdelim + Optional(result_noun).hide() + Optional(T('VBD')).hide() + ZeroOrMore(Not(verb) + Any()).hide() + verb.hide() + OneOrMore(standard_phrase | apparatus_phrase | temperature_phrase | solvent_phrase | Any().hide()))('context_phrase') +context_phrase = Group(measurement + optdelim + Optional(result_noun).hide() + Optional(T('VBD')).hide() + ZeroOrMore(Not(verb) + Any()).hide() + verb.hide() + OneOrMore(standard_phrase | apparatus_phrase | temperature_phrase | solvent_phrase | method | Any().hide()))('context_phrase') # TODO: Multiple measurements, multiple apparatus. # TODO: 'respectively' phrase @@ -81,10 +85,11 @@ def interpret(self, result, start, end): } measurement = result.xpath('./measurement/*[1]')[0] - if not measurement.tag == 'melting_point': + if not measurement.tag == 'melting_point' and not measurement.tag =='glass_transition': context['temperature'] = first(result.xpath('./temperature/value/text()')) context['temperature_units'] = first(result.xpath('./temperature/units/text()')) + if measurement.tag == 'photophysical_properties': c.quantum_yields.append(QuantumYield(**context)) c.fluorescence_lifetimes.append(FluorescenceLifetime(**context)) @@ -93,6 +98,8 @@ def interpret(self, result, start, end): c.quantum_yields.append(QuantumYield(**context)) if measurement.tag == 'melting_point': c.melting_points.append(MeltingPoint(**context)) + if measurement.tag == 'glass_transition': + c.glass_transitions.append(GlassTransition(**context)) if measurement.tag == 'nmr': c.nmr_spectra.append(NmrSpectrum(**context)) if measurement.tag == 'uvvis': diff --git a/chemdataextractor/parse/mp.py b/chemdataextractor/parse/mp.py index 8021243..a671b13 100644 --- a/chemdataextractor/parse/mp.py +++ b/chemdataextractor/parse/mp.py @@ -26,7 +26,7 @@ log = logging.getLogger(__name__) -prefix = Optional(I('a')).hide() + (R('^m\.?pt?\.?$', re.I) | I('melting') + 
Optional((I('point') | I('temperature')| I('range'))) | R('^m\.?$', re.I) + R('^pt?\.?$', re.I)).hide() + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about')).hide() +prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Tm') + Optional(rbrct)| R('^m\.?pt?\.?$', re.I) | I('melting') + Optional((I('point') | I('temperature')| I('range'))) | R('^m\.?$', re.I) + R('^pt?\.?$', re.I)).hide() + Optional(lbrct + W('Tm') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about')).hide() delim = R('^[:;\.,]$') @@ -38,7 +38,7 @@ to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') + R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join) temp_range = (Optional(R('^[\-–−]$')) + (joined_range | spaced_range | to_range))('value').add_action(merge) temp_value = (Optional(R('^[~∼˜\<\>]$')) + Optional(R('^[\-–−]$')) + R('^[\+\-–−]?\d+(\.\d+)?$'))('value').add_action(merge) -temp = (temp_range | temp_value)('value') +temp = Optional(lbrct).hide() + (temp_range | temp_value)('value') + Optional(rbrct).hide() mp = (prefix + Optional(delim).hide() + temp + units)('mp') @@ -52,7 +52,6 @@ mp_phrase = cem_mp_phrase | to_give_mp_phrase | obtained_mp_phrase - class MpParser(BaseParser): """""" root = mp_phrase diff --git a/chemdataextractor/parse/table.py b/chemdataextractor/parse/table.py index 9d851c8..5e70635 100644 --- a/chemdataextractor/parse/table.py +++ b/chemdataextractor/parse/table.py @@ -19,7 +19,7 @@ from .common import delim from ..utils import first -from ..model import Compound, UvvisSpectrum, UvvisPeak, QuantumYield, FluorescenceLifetime, MeltingPoint +from ..model import Compound, UvvisSpectrum, UvvisPeak, QuantumYield, FluorescenceLifetime, MeltingPoint, GlassTransition from ..model import ElectrochemicalPotential, IrSpectrum, IrPeak from 
.actions import join, merge, fix_whitespace from .base import BaseParser @@ -201,6 +201,12 @@ def split_uvvis_shape(tokens, start, result): temp_with_optional_units + ZeroOrMore(delims.hide() + temp_with_optional_units) )('melting_point_cell') +glass_transition_title = R('^Tg\.?$', re.I) | W('T') + R('^g\.?$') +glass_transition_heading = (glass_transition_title.hide() + delims.hide() + Optional(temp_units))('glass_transition_heading') +glass_transition_cell = ( + temp_with_optional_units + ZeroOrMore(delims.hide() + temp_with_optional_units) +)('glass_transition_cell') + caption_context = Group(subject_phrase | solvent_phrase | temp_phrase)('caption_context') @@ -241,11 +247,12 @@ def interpret(self, result, start, end): solvent = first(result.xpath('./name/text()')) if solvent is not None: context = {'solvent': solvent} - c.melting_points = [MeltingPoint(context)] - c.quantum_yields = [QuantumYield(context)] - c.fluorescence_lifetimes = [FluorescenceLifetime(context)] - c.electrochemical_potentials = [ElectrochemicalPotential(context)] - c.uvvis_spectra = [UvvisSpectrum(context)] + c.melting_points = [MeltingPoint(**context)] + c.glass_transitions = [GlassTransition(**context)] + c.quantum_yields = [QuantumYield(**context)] + c.fluorescence_lifetimes = [FluorescenceLifetime(**context)] + c.electrochemical_potentials = [ElectrochemicalPotential(**context)] + c.uvvis_spectra = [UvvisSpectrum(**context)] if c.serialize(): yield c @@ -261,10 +268,11 @@ def interpret(self, result, start, end): 'temperature': first(result.xpath('./value/text()')), 'temperature_units': first(result.xpath('./units/text()')) } - c.quantum_yields = [QuantumYield(context)] - c.fluorescence_lifetimes = [FluorescenceLifetime(context)] - c.electrochemical_potentials = [ElectrochemicalPotential(context)] - c.uvvis_spectra = [UvvisSpectrum(context)] + # RBT: same problem as before - context must be unpacked with ** 
+ c.quantum_yields = [QuantumYield(**context)] + c.fluorescence_lifetimes = [FluorescenceLifetime(**context)] + c.electrochemical_potentials = [ElectrochemicalPotential(**context)] + c.uvvis_spectra = [UvvisSpectrum(**context)] yield c @@ -278,11 +286,13 @@ def interpret(self, result, start, end): solvent = first(result.xpath('./name/text()')) if solvent is not None: context = {'solvent': solvent} - c.melting_points = [MeltingPoint(context)] - c.quantum_yields = [QuantumYield(context)] - c.fluorescence_lifetimes = [FluorescenceLifetime(context)] - c.electrochemical_potentials = [ElectrochemicalPotential(context)] - c.uvvis_spectra = [UvvisSpectrum(context)] + # RBT (Added ** to context) + c.melting_points = [MeltingPoint(**context)] + c.glass_transitions = [GlassTransition(**context)] + c.quantum_yields = [QuantumYield(**context)] + c.fluorescence_lifetimes = [FluorescenceLifetime(**context)] + c.electrochemical_potentials = [ElectrochemicalPotential(**context)] + c.uvvis_spectra = [UvvisSpectrum(**context)] if c.serialize(): yield c @@ -597,6 +607,37 @@ def interpret(self, result, start, end): yield c +class GlassTransitionHeadingParser(BaseParser): + """Parse a glass transition column heading and pick up its units, if present.""" + root = glass_transition_heading + + def interpret(self, result, start, end): + """Yield a Compound; glass transition units are attached when the heading has them.""" + glass_transition_units = first(result.xpath('./units/text()')) + c = Compound() + if glass_transition_units: + c.glass_transitions.append( + GlassTransition(units=glass_transition_units) + ) + yield c + +class GlassTransitionCellParser(BaseParser): + """Parse glass transition temperature values from a table cell.""" + root = glass_transition_cell + + def interpret(self, result, start, end): + """Yield a Compound with one GlassTransition per ./temp element in the result.""" + c = Compound() + for tg in result.xpath('./temp'): + c.glass_transitions.append( + GlassTransition( + value=first(tg.xpath('./value/text()')), + units=first(tg.xpath('./units/text()')) + ) + ) + if c.glass_transitions: + yield c + class ElectrochemicalPotentialHeadingParser(BaseParser): """""" root = electrochemical_potential_heading @@ -650,6 +691,8 @@ def interpret(self, result, 
start, end): if context: c.melting_points = [MeltingPoint(**context)] + # Glass transition temperature shouldn't have contextual temperature + c.glass_transitions = [GlassTransition(**context)] temp = first(result.xpath('./temp_phrase')) if temp is not None: context['temperature'] = first(temp.xpath('./temp/value/text()')) context['temperature_units'] = first(temp.xpath('./temp/units/text()')) diff --git a/chemdataextractor/parse/tg.py b/chemdataextractor/parse/tg.py new file mode 100644 index 0000000..beaa45f --- /dev/null +++ b/chemdataextractor/parse/tg.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +""" +chemdataextractor.parse.tg +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Glass transition temperature text parser. + +:copyright: Copyright 2016 by Matt Swain. +:license: MIT, see LICENSE file for more details. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import logging +import re + +from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name +from chemdataextractor.parse.common import lbrct, dt, rbrct, hyphen +from ..utils import first +from ..model import Compound, GlassTransition +from .actions import merge, join +from .base import BaseParser +from .elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore + +log = logging.getLogger(__name__) + +prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Tg') + Optional(rbrct) | I('glass') + Optional(I('transition')) + Optional((I('temperature') | I('range') | I('temp.'))) | W('transition') + Optional((I('temperature') | I('range') | I('temp.')))).hide() + Optional(lbrct + W('Tg') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | I('around') | I('ca') | I('ca.')).hide() +#prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Tg') + Optional(rbrct) | I('glass') + Optional(I('transition')) + Optional((I('temperature') | I('range') | I('temp.')))).hide() + Optional(lbrct + W('Tg') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('ca') | I('ca.')).hide() + +delim = R('^[:;\.,]$') + +# TODO: Consider allowing degree symbol to be optional. The prefix should be restrictive enough to stop false positives. 
+units = (W('°') + Optional(R('^[CFK]\.?$')) | R('^K\.?$'))('units').add_action(merge) + +joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge) +spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') + R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge) +to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') + R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join) +temp_range = (Optional(R('^[\-–−]$')) + (joined_range | spaced_range | to_range))('value').add_action(merge) +temp_value = (Optional(R('^[~∼˜\<\>]$')) + Optional(R('^[\-–−]$')) + R('^[\+\-–−]?\d+(\.\d+)?$'))('value').add_action(merge) +temp = Optional(lbrct).hide() + (temp_range | temp_value)('value') + Optional(rbrct).hide() + +tg = (prefix + Optional(delim).hide() + temp + units)('tg') + +bracket_any = lbrct + OneOrMore(Not(tg) + Not(rbrct) + Any()) + rbrct + +cem_tg_phrase = (Optional(cem) + Optional(I('having')).hide() + Optional(delim).hide() + Optional(bracket_any).hide() + Optional(delim).hide() + Optional(lbrct) + tg + Optional(rbrct))('tg_phrase') + +obtained_tg_phrase = ((cem | chemical_label) + (I('is') | I('are') | I('was')).hide() + (I('measured') | I('obtained') | I('yielded')).hide() + ZeroOrMore(Not(tg) + Not(cem) + Any()).hide() + tg)('tg_phrase') + +#tg_phrase = cem_tg_phrase | method1_phrase | method2_phrase | method3_phrase | obtained_tg_phrase +tg_phrase = cem_tg_phrase | obtained_tg_phrase + + +class TgParser(BaseParser): + """Glass transition temperature text parser.""" + root = tg_phrase + + #print ('outside parser', tg_phrase, type(tg_phrase)) + + def interpret(self, result, start, end): + compound = Compound( + glass_transitions=[ + GlassTransition( + value=first(result.xpath('./tg/value/text()')), + units=first(result.xpath('./tg/units/text()')) + ) + ] + ) + cem_el = first(result.xpath('./cem')) + if cem_el is not None: + compound.names = 
cem_el.xpath('./name/text()') + compound.labels = cem_el.xpath('./label/text()') + yield compound + diff --git a/tests/test_parse_mp.py b/tests/test_parse_mp.py index 99972db..0ddaba1 100644 --- a/tests/test_parse_mp.py +++ b/tests/test_parse_mp.py @@ -203,6 +203,11 @@ def test_colon(self): expected = '