Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add parser for glass transition temperature #13

Merged
merged 1 commit into from
Feb 2, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion chemdataextractor/doc/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
ExtinctionHeadingParser, FluorescenceLifetimeHeadingParser, FluorescenceLifetimeCellParser, \
ElectrochemicalPotentialHeadingParser, ElectrochemicalPotentialCellParser, IrHeadingParser, IrCellParser, \
SolventCellParser, SolventHeadingParser, SolventInHeadingParser, UvvisAbsEmiQuantumYieldHeadingParser, \
UvvisAbsEmiQuantumYieldCellParser, MeltingPointHeadingParser, MeltingPointCellParser, TempInHeadingParser, \
UvvisAbsEmiQuantumYieldCellParser, MeltingPointHeadingParser, MeltingPointCellParser, GlassTransitionHeadingParser, GlassTransitionCellParser, TempInHeadingParser, \
UvvisAbsDisallowedHeadingParser, UvvisEmiQuantumYieldHeadingParser, UvvisEmiQuantumYieldCellParser
# TODO: Sort out the above import... import module instead
from ..nlp.tag import NoneTagger
Expand Down Expand Up @@ -50,6 +50,7 @@ class Table(CaptionedElement):
(FluorescenceLifetimeHeadingParser(), FluorescenceLifetimeCellParser()),
(ElectrochemicalPotentialHeadingParser(), ElectrochemicalPotentialCellParser()),
(MeltingPointHeadingParser(), MeltingPointCellParser()),
(GlassTransitionHeadingParser(), GlassTransitionCellParser()),
(SolventHeadingParser(), SolventCellParser()),
(SolventInHeadingParser(),),
(TempInHeadingParser(),)
Expand Down
3 changes: 2 additions & 1 deletion chemdataextractor/doc/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ..parse.table import CaptionContextParser
from ..parse.ir import IrParser
from ..parse.mp import MpParser
from ..parse.tg import TgParser
from ..parse.nmr import NmrParser
from ..parse.uvvis import UvvisParser
from ..nlp.lexicon import ChemLexicon
Expand Down Expand Up @@ -267,7 +268,7 @@ def _repr_html_(self):

class Paragraph(Text):

parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), ContextParser()]
parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), ContextParser()]

def _repr_html_(self):
return '<p class="cde-paragraph">' + self.text + '</p>'
Expand Down
12 changes: 12 additions & 0 deletions chemdataextractor/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,14 @@ class MeltingPoint(BaseModel):
apparatus = StringType(contextual=True)


class GlassTransition(BaseModel):
    """A glass transition temperature.

    ``value`` is the only field declared without ``contextual=True``; all
    other fields are flagged as contextual.
    """
    # The temperature value itself, held as a string.
    value = StringType()
    # Units of ``value``.
    units = StringType(contextual=True)
    # NOTE(review): presumably the measurement technique (e.g. DSC) - confirm against the parsers that populate it.
    method = StringType(contextual=True)
    # Concentration and its units, both held as strings.
    concentration = StringType(contextual=True)
    concentration_units = StringType(contextual=True)

class QuantumYield(BaseModel):
"""A quantum yield measurement."""
value = StringType()
Expand Down Expand Up @@ -437,6 +445,7 @@ class Compound(BaseModel):
ir_spectra = ListType(ModelType(IrSpectrum))
uvvis_spectra = ListType(ModelType(UvvisSpectrum))
melting_points = ListType(ModelType(MeltingPoint))
glass_transitions = ListType(ModelType(GlassTransition))
quantum_yields = ListType(ModelType(QuantumYield))
fluorescence_lifetimes = ListType(ModelType(FluorescenceLifetime))
electrochemical_potentials = ListType(ModelType(ElectrochemicalPotential))
Expand All @@ -460,6 +469,9 @@ def merge_contextual(self, other):
for item in self[k]:
# print('item: %s' % item)
for other_item in other.get(k, []):
#RBT Problem with doi: ma301230y. Had to add following check
if (isinstance(other_item,unicode) == True):
continue
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This problem seems to occur when merging the 'contextual' information from one compound record into another while the 'contextual' record also happens to contain a name/label/role. This can happen when merging in information from table footnotes or the caption - the idea is to merge e.g. a solvent into the property in the table cell that references it, but sometimes there is an actual compound named in the footnote. This fix seems to work well - it simply ignores the string-based properties, which are name/label/role. However, `unicode` is Python 2 only - this needs to be `six.text_type` for Python 2 and 3 compatibility. In future, this whole merge_contextual method needs refactoring to be more agnostic of the model schema.

# if k in {'names', 'labels'}:
# # TODO: Warn attempting to merge a contextual other that contains names/labels
# continue
Expand Down
3 changes: 2 additions & 1 deletion chemdataextractor/parse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,14 @@
from .context import ContextParser
from .ir import IrParser
from .mp import MpParser
from .tg import TgParser
from .nmr import NmrParser
from .table import CompoundHeadingParser, SolventHeadingParser, UvvisAbsDisallowedHeadingParser, SolventInHeadingParser
from .table import TempInHeadingParser, SolventCellParser, CompoundCellParser, UvvisEmiHeadingParser
from .table import UvvisAbsHeadingParser, ExtinctionHeadingParser, IrHeadingParser, IrCellParser
from .table import QuantumYieldHeadingParser, QuantumYieldCellParser, UvvisEmiCellParser, UvvisAbsCellParser
from .table import ExtinctionCellParser, UvvisAbsEmiQuantumYieldHeadingParser, UvvisAbsEmiQuantumYieldCellParser
from .table import UvvisEmiQuantumYieldHeadingParser, UvvisEmiQuantumYieldCellParser, FluorescenceLifetimeHeadingParser
from .table import FluorescenceLifetimeCellParser, MeltingPointHeadingParser, MeltingPointCellParser
from .table import FluorescenceLifetimeCellParser, MeltingPointHeadingParser, MeltingPointCellParser, GlassTransitionHeadingParser, GlassTransitionCellParser
from .table import ElectrochemicalPotentialHeadingParser, ElectrochemicalPotentialCellParser, CaptionContextParser
from .uvvis import UvvisParser
15 changes: 11 additions & 4 deletions chemdataextractor/parse/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from .common import optdelim, hyphen, slash
from ..utils import first
from ..parse.base import BaseParser
from ..model import Compound, QuantumYield, NmrSpectrum, UvvisSpectrum, IrSpectrum, MeltingPoint, FluorescenceLifetime
from ..model import Compound, QuantumYield, NmrSpectrum, UvvisSpectrum, IrSpectrum, MeltingPoint, GlassTransition, FluorescenceLifetime
from .actions import join, merge, fix_whitespace
from .cem import chemical_name
from .elements import I, T, R, W, ZeroOrMore, Optional, Group, OneOrMore, Any, Not
Expand All @@ -33,13 +33,17 @@
uvvis = (I('UV') + (hyphen | slash) + R('^vis(ible)?$', re.I) + Optional(R('^abs(or[bp]tion)?$')))('uvvis').add_action(join).add_action(fix_whitespace)
ir = (R('^(FT-?)?IR|FT-?IS$'))('ir').add_action(join)
mp = (I('melting') + I('points'))('melting_point').add_action(join)
tg = (I('glass') + I('transition') + I('temperature'))('glass_transition').add_action(join)
pp = (I('photophysical') + (I('measurements') | I('properties')))('photophysical_properties').add_action(join)
measurement = Group(quantum_yield | nmr | uvvis | ir | mp | pp)('measurement')
measurement = Group(quantum_yield | nmr | uvvis | ir | mp | tg | pp)('measurement')

result_noun = I('data') | I('results') | I('experiments') | I('spectra')

verb = W('measured') | W('recorded') | W('collected') | W('taken') | W('acquired') | W('obtained') | W('run') | (W('carried') + W('out') | W('performed')) # | T('VBN')


# Single-token, case-insensitive method names that may appear in a context phrase.
# Every entry is its own alternative: `+` binds tighter than `|`, so writing
# `I('DTA') + I('RTL')` would only match the two-token sequence "DTA RTL".
method = (
    I('DSC') | I('TMA') | I('DTA') | I('RTL') | I('DMA') | I('DMTA')
    | I('dilatometer') | I('dilatometry') | I('PALS')
)

apparatus_type = R('^\d{2,}$') + W('MHz')
brands = I('HORIBA') + I('Jobin') + I('Yvon') | I('Hitachi') | I('Bruker') | I('Cary') | I('Jeol') | I('PerkinElmer') | I('Agilent') | I('Shimadzu') | I('Varian')
models = I('FluoroMax-4') | I('F-7000') | I('AVANCE') | I('Digital') | R('\d\d\d+') | I('UV–vis-NIR') | I('Mercury') | I('Avatar') | I('thermonicolet') | I('pulsed') | I('Fourier') | I('transform')
Expand All @@ -60,7 +64,7 @@
standard = (ZeroOrMore(T('JJ')) + OneOrMore(T('NNP') | T('NN') | T('HYPH') | T('CD') | T('B-CM') | T('I-CM')))('standard').add_action(join).add_action(fix_whitespace)
standard_phrase = (W('with') | W('using')).hide() + Optional(dt).hide() + standard + (ZeroOrMore(W('as') | dt) + Optional(T('JJ')) + I('standard')).hide()

context_phrase = Group(measurement + optdelim + Optional(result_noun).hide() + Optional(T('VBD')).hide() + ZeroOrMore(Not(verb) + Any()).hide() + verb.hide() + OneOrMore(standard_phrase | apparatus_phrase | temperature_phrase | solvent_phrase | Any().hide()))('context_phrase')
context_phrase = Group(measurement + optdelim + Optional(result_noun).hide() + Optional(T('VBD')).hide() + ZeroOrMore(Not(verb) + Any()).hide() + verb.hide() + OneOrMore(standard_phrase | apparatus_phrase | temperature_phrase | solvent_phrase | method | Any().hide()))('context_phrase')

# TODO: Multiple measurements, multiple apparatus.
# TODO: 'respectively' phrase
Expand All @@ -81,10 +85,11 @@ def interpret(self, result, start, end):
}
measurement = result.xpath('./measurement/*[1]')[0]

if not measurement.tag == 'melting_point':
if not measurement.tag == 'melting_point' and not measurement.tag =='glass_transition':
context['temperature'] = first(result.xpath('./temperature/value/text()'))
context['temperature_units'] = first(result.xpath('./temperature/units/text()'))


if measurement.tag == 'photophysical_properties':
c.quantum_yields.append(QuantumYield(**context))
c.fluorescence_lifetimes.append(FluorescenceLifetime(**context))
Expand All @@ -93,6 +98,8 @@ def interpret(self, result, start, end):
c.quantum_yields.append(QuantumYield(**context))
if measurement.tag == 'melting_point':
c.melting_points.append(MeltingPoint(**context))
if measurement.tag == 'glass_transition':
c.glass_transitions.append(GlassTransition(**context))
if measurement.tag == 'nmr':
c.nmr_spectra.append(NmrSpectrum(**context))
if measurement.tag == 'uvvis':
Expand Down
5 changes: 2 additions & 3 deletions chemdataextractor/parse/mp.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

log = logging.getLogger(__name__)

prefix = Optional(I('a')).hide() + (R('^m\.?pt?\.?$', re.I) | I('melting') + Optional((I('point') | I('temperature')| I('range'))) | R('^m\.?$', re.I) + R('^pt?\.?$', re.I)).hide() + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about')).hide()
prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Tm') + Optional(rbrct)| R('^m\.?pt?\.?$', re.I) | I('melting') + Optional((I('point') | I('temperature')| I('range'))) | R('^m\.?$', re.I) + R('^pt?\.?$', re.I)).hide() + Optional(lbrct + W('Tm') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about')).hide()

delim = R('^[:;\.,]$')

Expand All @@ -38,7 +38,7 @@
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') + R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)
temp_range = (Optional(R('^[\-–−]$')) + (joined_range | spaced_range | to_range))('value').add_action(merge)
temp_value = (Optional(R('^[~∼˜\<\>]$')) + Optional(R('^[\-–−]$')) + R('^[\+\-–−]?\d+(\.\d+)?$'))('value').add_action(merge)
temp = (temp_range | temp_value)('value')
temp = Optional(lbrct).hide() + (temp_range | temp_value)('value') + Optional(rbrct).hide()

mp = (prefix + Optional(delim).hide() + temp + units)('mp')

Expand All @@ -52,7 +52,6 @@

mp_phrase = cem_mp_phrase | to_give_mp_phrase | obtained_mp_phrase


class MpParser(BaseParser):
""""""
root = mp_phrase
Expand Down
78 changes: 63 additions & 15 deletions chemdataextractor/parse/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from .common import delim
from ..utils import first
from ..model import Compound, UvvisSpectrum, UvvisPeak, QuantumYield, FluorescenceLifetime, MeltingPoint
from ..model import Compound, UvvisSpectrum, UvvisPeak, QuantumYield, FluorescenceLifetime, MeltingPoint, GlassTransition
from ..model import ElectrochemicalPotential, IrSpectrum, IrPeak
from .actions import join, merge, fix_whitespace
from .base import BaseParser
Expand Down Expand Up @@ -201,6 +201,12 @@ def split_uvvis_shape(tokens, start, result):
temp_with_optional_units + ZeroOrMore(delims.hide() + temp_with_optional_units)
)('melting_point_cell')

# Heading text for a glass transition column: a single "Tg" / "Tg." token
# (case-insensitive), or a "T" token followed by "g" / "g.". The trailing dot
# is optional so that the plain "Tg" heading is recognised.
glass_transition_title = R(r'^T(g\.?)$', re.I) | W('T') + R(r'^(g\.?)?$')
# Full heading: the (hidden) title, hidden delimiters, then optional temperature units.
glass_transition_heading = (glass_transition_title.hide() + delims.hide() + Optional(temp_units))('glass_transition_heading')
# Cell contents: one or more temperatures separated by hidden delimiters.
glass_transition_cell = (
    temp_with_optional_units + ZeroOrMore(delims.hide() + temp_with_optional_units)
)('glass_transition_cell')

caption_context = Group(subject_phrase | solvent_phrase | temp_phrase)('caption_context')


Expand Down Expand Up @@ -241,11 +247,12 @@ def interpret(self, result, start, end):
solvent = first(result.xpath('./name/text()'))
if solvent is not None:
context = {'solvent': solvent}
c.melting_points = [MeltingPoint(context)]
c.quantum_yields = [QuantumYield(context)]
c.fluorescence_lifetimes = [FluorescenceLifetime(context)]
c.electrochemical_potentials = [ElectrochemicalPotential(context)]
c.uvvis_spectra = [UvvisSpectrum(context)]
c.melting_points = [MeltingPoint(**context)]
c.glass_transitions = [GlassTransition(**context)]
c.quantum_yields = [QuantumYield(**context)]
c.fluorescence_lifetimes = [FluorescenceLifetime(**context)]
c.electrochemical_potentials = [ElectrochemicalPotential(**context)]
c.uvvis_spectra = [UvvisSpectrum(**context)]
if c.serialize():
yield c

Expand All @@ -261,10 +268,11 @@ def interpret(self, result, start, end):
'temperature': first(result.xpath('./value/text()')),
'temperature_units': first(result.xpath('./units/text()'))
}
c.quantum_yields = [QuantumYield(context)]
c.fluorescence_lifetimes = [FluorescenceLifetime(context)]
c.electrochemical_potentials = [ElectrochemicalPotential(context)]
c.uvvis_spectra = [UvvisSpectrum(context)]
# RBT Same problem as before missing **?
c.quantum_yields = [QuantumYield(**context)]
c.fluorescence_lifetimes = [FluorescenceLifetime(**context)]
c.electrochemical_potentials = [ElectrochemicalPotential(**context)]
c.uvvis_spectra = [UvvisSpectrum(**context)]
yield c


Expand All @@ -278,11 +286,13 @@ def interpret(self, result, start, end):
solvent = first(result.xpath('./name/text()'))
if solvent is not None:
context = {'solvent': solvent}
c.melting_points = [MeltingPoint(context)]
c.quantum_yields = [QuantumYield(context)]
c.fluorescence_lifetimes = [FluorescenceLifetime(context)]
c.electrochemical_potentials = [ElectrochemicalPotential(context)]
c.uvvis_spectra = [UvvisSpectrum(context)]
# RBT (Added ** to context)
c.melting_points = [MeltingPoint(**context)]
c.glass_transitions = [GlassTransition(**context)]
c.quantum_yields = [QuantumYield(**context)]
c.fluorescence_lifetimes = [FluorescenceLifetime(**context)]
c.electrochemical_potentials = [ElectrochemicalPotential(**context)]
c.uvvis_spectra = [UvvisSpectrum(**context)]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch on the missing ** - I forgot to update this when I changed the model objects to accept keyword arguments rather than a single dictionary. The ** unpacks the context dict into the equivalent of keyword arguments such as solvent='acetonitrile'.

if c.serialize():
yield c

Expand Down Expand Up @@ -597,6 +607,37 @@ def interpret(self, result, start, end):
yield c


class GlassTransitionHeadingParser(BaseParser):
    """Parser rooted at ``glass_transition_heading``."""
    root = glass_transition_heading

    def interpret(self, result, start, end):
        """Yield a Compound built from a matched glass transition heading.

        When the heading carries units, they are recorded on a single
        GlassTransition entry attached to the yielded Compound.
        """
        units = first(result.xpath('./units/text()'))
        compound = Compound()
        if units:
            compound.glass_transitions.append(GlassTransition(units=units))
        # NOTE(review): the Compound is yielded whether or not units were found - confirm this matches the sibling heading parsers.
        yield compound

class GlassTransitionCellParser(BaseParser):
    """Parser rooted at ``glass_transition_cell``."""
    root = glass_transition_cell

    def interpret(self, result, start, end):
        """Yield a Compound holding one GlassTransition per ``temp`` element.

        :param result: Parsed result for the cell; queried with XPath.
        :param start: Not used by this method.
        :param end: Not used by this method.
        :returns: Generator yielding a single Compound, or nothing when the
            cell produced no glass transition entries.
        """
        c = Compound()
        for tg in result.xpath('./temp'):
            # Read value/units from the element being iterated.
            c.glass_transitions.append(
                GlassTransition(
                    value=first(tg.xpath('./value/text()')),
                    units=first(tg.xpath('./units/text()'))
                )
            )
        # The Compound field is the plural ``glass_transitions``.
        if c.glass_transitions:
            yield c

class ElectrochemicalPotentialHeadingParser(BaseParser):
""""""
root = electrochemical_potential_heading
Expand Down Expand Up @@ -650,6 +691,13 @@ def interpret(self, result, start, end):
if context:
c.melting_points = [MeltingPoint(**context)]
temp = first(result.xpath('./temp_phrase'))
if temp is not None:
context['temperature'] = first(temp.xpath('./temp/value/text()'))
context['temperature_units'] = first(temp.xpath('./temp/units/text()'))
# Glass transition temperature shouldn't have contextual temperature
if context:
c.glass_transitions = [GlassTransition(**context)]
temp = first(result.xpath('./temp_phrase'))
if temp is not None:
context['temperature'] = first(temp.xpath('./temp/value/text()'))
context['temperature_units'] = first(temp.xpath('./temp/units/text()'))
Expand Down
76 changes: 76 additions & 0 deletions chemdataextractor/parse/tg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
"""
chemdataextractor.parse.tg
~~~~~~~~~~~~~~~~~~~~~~~~~~

Glass transition temperature text parser.

:copyright: Copyright 2016 by Matt Swain.
:license: MIT, see LICENSE file for more details.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.common import lbrct, dt, rbrct, hyphen
from ..utils import first
from ..model import Compound, GlassTransition
from .actions import merge, join
from .base import BaseParser
from .elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

log = logging.getLogger(__name__)

# Words that introduce a glass transition temperature: an optional article, the
# property name ("Tg", "glass [transition] [temperature|range|temp.]" or
# "transition [temperature|range|temp.]"), an optional bracketed "(Tg)", an
# optional linking word and an optional approximation phrase.
prefix = (
    Optional(I('a')).hide()
    + (
        Optional(lbrct) + W('Tg') + Optional(rbrct)
        | I('glass') + Optional(I('transition')) + Optional(I('temperature') | I('range') | I('temp.'))
        | W('transition') + Optional(I('temperature') | I('range') | I('temp.'))
    ).hide()
    + Optional(lbrct + W('Tg') + rbrct)
    + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide()
    + Optional(
        I('in') + I('the') + I('range') + Optional(I('of'))
        | I('about') | I('around') | I('ca') | I('ca.')
    ).hide()
)

# A single punctuation token that may separate the prefix from the value.
delim = R(r'^[:;\.,]$')

# TODO: Consider allowing degree symbol to be optional. The prefix should be restrictive enough to stop false positives.
# Either a degree sign optionally followed by C/F/K, or a bare "K" / "K." token.
# The bare-kelvin alternative must be a regex element for the optional dot to apply.
units = (W('°') + Optional(R(r'^[CFK]\.?$')) | R(r'^K\.?$'))('units').add_action(merge)

# Range written as one token, e.g. "100-105".
joined_range = R(r'^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
# Range written as separate tokens around a dash/tilde, or as a number followed by a signed number.
spaced_range = (R(r'^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R(r'^[\-–−~∼˜]$') + R(r'^[\+\-–−]?\d+(\.\d+)?$') | R(r'^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
# Range written with the word "to"; tokens are combined with `join` rather than `merge`.
to_range = (R(r'^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') + R(r'^[\+\-–−]?\d+(\.\d+)?$') | R(r'^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)
temp_range = (Optional(R(r'^[\-–−]$')) + (joined_range | spaced_range | to_range))('value').add_action(merge)
# Single value, optionally preceded by an approximation/comparison symbol and a sign.
temp_value = (Optional(R(r'^[~∼˜\<\>]$')) + Optional(R(r'^[\-–−]$')) + R(r'^[\+\-–−]?\d+(\.\d+)?$'))('value').add_action(merge)
# A range or single value, optionally wrapped in (hidden) brackets.
temp = Optional(lbrct).hide() + (temp_range | temp_value)('value') + Optional(rbrct).hide()

# Complete glass transition expression: prefix, optional delimiter, temperature, units.
tg = (prefix + Optional(delim).hide() + temp + units)('tg')

# Bracketed text that contains neither a tg expression nor a closing bracket.
bracket_any = lbrct + OneOrMore(Not(tg) + Not(rbrct) + Any()) + rbrct

# Optional chemical entity mention followed (possibly after bracketed text) by a tg expression.
cem_tg_phrase = (Optional(cem) + Optional(I('having')).hide() + Optional(delim).hide() + Optional(bracket_any).hide() + Optional(delim).hide() + Optional(lbrct) + tg + Optional(rbrct))('tg_phrase')

# "<compound> is/are/was measured/obtained/yielded ... <tg>".
obtained_tg_phrase = ((cem | chemical_label) + (I('is') | I('are') | I('was')).hide() + (I('measured') | I('obtained') | I('yielded')).hide() + ZeroOrMore(Not(tg) + Not(cem) + Any()).hide() + tg)('tg_phrase')

tg_phrase = cem_tg_phrase | obtained_tg_phrase


class TgParser(BaseParser):
    """Parser rooted at ``tg_phrase``."""
    root = tg_phrase

    def interpret(self, result, start, end):
        """Yield one Compound carrying the glass transition found in ``result``.

        The value and units are read from the ``tg`` element. When a ``cem``
        element is present, its names and labels are copied onto the Compound.
        """
        tg_value = first(result.xpath('./tg/value/text()'))
        tg_units = first(result.xpath('./tg/units/text()'))
        record = Compound(
            glass_transitions=[GlassTransition(value=tg_value, units=tg_units)]
        )
        chemical = first(result.xpath('./cem'))
        if chemical is not None:
            record.names = chemical.xpath('./name/text()')
            record.labels = chemical.xpath('./label/text()')
        yield record

Loading