Skip to content

Commit

Permalink
Merge pull request #7 from scieloorg/model_adequacy
Browse files Browse the repository at this point in the history
  • Loading branch information
gitnnolabs authored Jul 21, 2022
2 parents b05e820 + 7be2ba3 commit 93ca75d
Show file tree
Hide file tree
Showing 7 changed files with 342 additions and 82 deletions.
67 changes: 38 additions & 29 deletions scholarly_articles/choices.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,41 @@
TYPE_OF_RESOURCE = [
("", ""),
("Book Section", "book-section"),
("Monograph", "monograph"),
("Report", "report"),
("Peer Review", "peer-review"),
("Book Track", "book-track"),
("Journal Article", "journal-article"),
("Part", "book-part"),
("Other", "other"),
("Book", "book"),
("Journal Volume", "journal-volume"),
("Book Set", "book-set"),
("Reference Entry", "reference-entry"),
("Proceedings Article", "proceedings-article"),
("Journal", "journal"),
("Component", "component"),
("Book Chapter", "book-chapter"),
("Proceedings Series", "proceedings-series"),
("Report Series", "report-series"),
("Proceedings", "proceedings"),
("Standard", "standard"),
("Reference Book", "reference-book"),
("Posted Content", "posted-content"),
("Journal Issue", "journal-issue"),
("Dissertation", "dissertation"),
("Grant", "grant"),
("Dataset", "dataset"),
("Book Series", "book-series"),
("Edited Book", "edited-book"),
("Standard Series", "standard-series")
("book-section", "Book Section"),
("monograph", "Monograph"),
("report", "Report"),
("peer-review", "Peer Review"),
("book-track", "Book Track"),
("journal-article", "Journal Article"),
("book-part", "Part"),
("other", "Other"),
("book", "Book"),
("journal-volume", "Journal Volume"),
("book-set", "Book Set"),
("reference-entry", "Reference Entry"),
("proceedings-article", "Proceedings Article"),
("journal", "Journal"),
("component", "Component"),
("book-chapter", "Book Chapter"),
("proceedings-series", "Proceedings Series"),
("report-series", "Report Series"),
("proceedings", "Proceedings"),
("standard", "Standard"),
("reference-book", "Reference Book"),
("posted-content", "Posted Content"),
("journal-issue", "Journal Issue"),
("dissertation", "Dissertation"),
("grant", "Grant"),
("dataset", "Dataset"),
("book-series", "Book Series"),
("edited-book", "Edited Book"),
("standard-series", "Standard Series")
]

# Open-access status choices as (stored value, human-readable label) pairs,
# the order Django expects for ``choices``.  The stored values are lowercase,
# matching the ``oa_status`` strings found in the harvested records
# (e.g. "closed").
OA_STATUS = [
    ("", ""),
    ("gold", "Gold"),
    ("hybrid", "Hybrid"),
    ("bronze", "Bronze"),
    ("green", "Green"),
    ("closed", "Closed"),
]
115 changes: 86 additions & 29 deletions scholarly_articles/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,53 +4,110 @@
from wagtail.admin.edit_handlers import FieldPanel

from . import choices
from core.models import CommonControlField


class ScholarlyArticles(models.Model):
doi = models.CharField("DOI", max_length=255, null=False, blank=False)
doi_url = models.URLField("DOI URL", max_length=255, null=True, blank=True)
genre = models.CharField("Resource Type", max_length=255, choices=choices.TYPE_OF_RESOURCE, null=False, blank=False)
is_oa = models.BooleanField("Opens Access", max_length=255, null=True, blank=True)
journal_is_in_doaj = models.BooleanField("DOAJ", max_length=255, null=True, blank=True)
journal_issns = models.CharField("ISSN's", max_length=255, null=False, blank=False)
journal_issn_l = models.CharField("ISSN-L", max_length=255, null=False, blank=False)
journal_name = models.CharField("Journal Name", max_length=255, null=True, blank=True)
published_date = models.DateTimeField("Published Date", max_length=255, null=True, blank=True)
publisher = models.CharField("Publisher", max_length=255, null=True, blank=True)
title = models.CharField("Title", max_length=255, null=True, blank=True)
article_json = models.JSONField("JSON File", null=True, blank=True)
doi = models.CharField(_("DOI"), max_length=255, null=True, blank=True)
year = models.CharField(_("Year"), max_length=4, null=True, blank=True)
contributors = models.ManyToManyField(_("Contributors"), null=True, blank=True)
journal = models.ForeignKey('Journals', on_delete=models.SET_NULL, max_length=255, null=True, blank=True)

def __unicode__(self):
return self.doi

def __str__(self):
return self.doi

panels = [
FieldPanel('doi'),
FieldPanel('doi_url'),
FieldPanel('genre'),
FieldPanel('is_oa'),
FieldPanel('journal_is_in_doaj'),
FieldPanel('journal_issns'),
FieldPanel('year'),
FieldPanel('contributors'),
FieldPanel('journal'),
]


class Journals(models.Model):
journal_issn_l = models.CharField(_("ISSN-L"), max_length=255, null=True, blank=True)
journal_issns = models.CharField(_("ISSN's"), max_length=255, null=True, blank=True)
journal_name = models.CharField(_("Journal Name"), max_length=255, null=True, blank=True)
publisher = models.CharField(_("Publisher"), max_length=255, null=True, blank=True)
journal_is_in_doaj = models.BooleanField(_("DOAJ"), max_length=255, default=False, null=True, blank=True)

def __unicode__(self):
return self.journal_issn_l

def __str__(self):
return self.journal_issn_l

panels = [
FieldPanel('journal_issn_l'),
FieldPanel('journal_issns'),
FieldPanel('journal_name'),
FieldPanel('published_date'),
FieldPanel('publisher'),
FieldPanel('title'),
FieldPanel('article_json'),
FieldPanel('journal_is_in_doaj'),
]


class Contributors(models.Model):
doi = models.CharField("DOI", max_length=255, null=False, blank=False)
doi_url = models.URLField("DOI URL", max_length=255, null=True, blank=True)
family = models.CharField("Family", max_length=255, null=False, blank=False)
given = models.CharField("Given", max_length=255, null=False, blank=False)
orcid = models.URLField("ORCID", max_length=255, null=False, blank=False)
authenticated_orcid = models.BooleanField("Authenticated", max_length=255, null=False, blank=False)
affiliation = models.CharField("Affiliation", max_length=255, null=False, blank=False)
family = models.CharField(_("Family Name"), max_length=255, null=True, blank=True)
given = models.CharField(_("Given Name"), max_length=255, null=True, blank=True)
orcid = models.CharField("ORCID", max_length=255, null=True, blank=True)
authenticated_orcid = models.BooleanField(_("Authenticated"), default=False, null=True, blank=True)
affiliation = models.ForeignKey(_("Affiliations"), on_delete=models.SET_NULL, max_length=255, null=True, blank=True)

def __unicode__(self):
return f"{self.family}, {self.given} ({self.orcid})"

def __str__(self):
return f"{self.family}, {self.given} ({self.orcid})"

panels = [
FieldPanel('doi'),
FieldPanel('doi_url'),
FieldPanel('family'),
FieldPanel('given'),
FieldPanel('orcid'),
FieldPanel('authenticated_orcid'),
FieldPanel('affiliation'),
]


class Affiliations(models.Model):
    """Affiliation referenced by contributors, described only by its name."""

    name = models.CharField(_("Affiliation Name"), max_length=255, null=True, blank=True)

    def __unicode__(self):
        # ``name`` is nullable; fall back to an empty string.
        return self.name or ""

    def __str__(self):
        # ``__str__`` must return a str: returning None (a null ``name``)
        # raises TypeError wherever the instance is rendered.
        return self.name or ""

    panels = [
        FieldPanel('name'),
    ]


class RawUnpaywall(models.Model):
    """Raw harvested record: a DOI, a few extracted fields and the JSON payload."""

    doi = models.CharField(
        _("DOI"),
        max_length=255,
        null=False,
        blank=False,
    )
    harvesting_creation = models.CharField(
        _("Harvesting date"),
        max_length=255,
        null=False,
        blank=False,
    )
    is_paratext = models.BooleanField(
        _("Paratext"),
        default=False,
        null=True,
        blank=True,
    )
    year = models.CharField(
        _("Year"),
        max_length=255,
        null=True,
        blank=True,
    )
    # unpaywall genre
    resource_type = models.CharField(
        _("Resource Type"),
        max_length=255,
        choices=choices.TYPE_OF_RESOURCE,
        null=False,
        blank=True,
    )
    update = models.CharField(
        _("Update"),
        max_length=255,
        null=True,
        blank=True,
    )
    json = models.JSONField(
        _("JSON File"),
        null=True,
        blank=True,
    )

    def __unicode__(self):
        return self.doi

    def __str__(self):
        return self.doi

    # One admin panel per field, in declaration order.
    panels = [
        FieldPanel(field_name)
        for field_name in (
            'doi',
            'harvesting_creation',
            'is_paratext',
            'year',
            'resource_type',
            'update',
            'json',
        )
    ]
Empty file.
3 changes: 3 additions & 0 deletions scholarly_articles/scripts/examples.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"doi": "10.1002/psp.2296", "year": 2020, "genre": "journal-article", "is_oa": false, "title": "Trap or opportunity—What role does geography play in the use of cash for childcare?", "doi_url": "https://doi.org/10.1002/psp.2296", "updated": "2021-04-02T00:47:21.884997", "oa_status": "closed", "publisher": "Wiley", "z_authors": [{"ORCID": "http://orcid.org/0000-0002-4877-2961", "given": "Lena", "family": "Magnusson Turner", "sequence": "first", "affiliation": [{"name": "Norwegian Social Research Oslo Metropolitan University Oslo Norway"}], "authenticated-orcid": false}, {"ORCID": "http://orcid.org/0000-0002-4536-9229", "given": "John", "family": "Östh", "sequence": "additional", "affiliation": [{"name": "Department of Social and Economic Geography Uppsala University Uppsala Sweden"}], "authenticated-orcid": false}], "is_paratext": true, "journal_name": "Population, Space and Place", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "1544-8444,1544-8452", "journal_issn_l": "1544-8444", "published_date": "2020-01-07", "best_oa_location": null, "first_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false, "oa_locations_embargoed": []}
{"doi": "10.1093/dote/doab014", "year": 2021, "genre": "journal-article", "is_oa": false, "title": "Population trends in achalasia diagnosis and management: a changing paradigm", "doi_url": "https://doi.org/10.1093/dote/doab014", "updated": "2021-03-17T21:05:16.450507", "oa_status": "closed", "publisher": "Oxford University Press (OUP)", "z_authors": [{"given": "Judy A", "family": "Trieu", "sequence": "first", "affiliation": [{"name": "Division of Gastroenterology and Nutrition, Loyola University Medical Center, Maywood, IL, USA"}]}, {"ORCID": "http://orcid.org/0000-0002-1870-9881", "given": "Arshish", "family": "Dua", "sequence": "additional", "affiliation": [{"name": "Division of Gastroenterology and Nutrition, Loyola University Medical Center, Maywood, IL, USA"}], "authenticated-orcid": false}, {"given": "Ikponmwosa", "family": "Enofe", "sequence": "additional", "affiliation": [{"name": "Division of Gastroenterology and Nutrition, Loyola University Medical Center, Maywood, IL, USA"}]}, {"given": "Nikhil", "family": "Shastri", "sequence": "additional", "affiliation": [{"name": "Division of Gastroenterology and Nutrition, Loyola University Medical Center, Maywood, IL, USA"}]}, {"given": "Mukund", "family": "Venu", "sequence": "additional", "affiliation": [{"name": "Division of Gastroenterology and Nutrition, Loyola University Medical Center, Maywood, IL, USA"}]}], "is_paratext": false, "journal_name": "Diseases of the Esophagus", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "1120-8694,1442-2050", "journal_issn_l": "1120-8694", "published_date": "2021-03-17", "best_oa_location": null, "first_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false, "oa_locations_embargoed": []}
{"doi": "10.1080/01430750.2019.1708791", "year": 2020, "genre": "journal-article", "is_oa": false, "title": "Techno-economic study of hybrid renewable energy system of Metropolitan Cities in India", "doi_url": "https://doi.org/10.1080/01430750.2019.1708791", "updated": "2021-03-30T01:23:35.505209", "oa_status": "closed", "publisher": "Informa UK Limited", "z_authors": [{"ORCID": "http://orcid.org/0000-0001-6059-6326", "given": "Balachander", "family": "Kalappan", "sequence": "first", "affiliation": [{"name": "Department of Electrical and Electronics Engineering, Faculty of Engineering, Karpagam Academy of Higher Education, Coimbatore, India"}], "authenticated-orcid": false}, {"ORCID": "http://orcid.org/0000-0001-5138-716X", "given": "A.", "family": "Amudha", "sequence": "additional", "affiliation": [{"name": "Department of Electrical and Electronics Engineering, Faculty of Engineering, Karpagam Academy of Higher Education, Coimbatore, India"}], "authenticated-orcid": false}, {"ORCID": "http://orcid.org/0000-0003-1370-1140", "given": "K.", "family": "Keerthivasan", "sequence": "additional", "affiliation": [{"name": "Higher College of Technology, Muscat, Oman"}], "authenticated-orcid": false}], "is_paratext": false, "journal_name": "International Journal of Ambient Energy", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0143-0750,2162-8246", "journal_issn_l": "0143-0750", "published_date": "2020-01-06", "best_oa_location": null, "first_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false, "oa_locations_embargoed": []}
100 changes: 100 additions & 0 deletions scholarly_articles/scripts/load_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from scholarly_articles import models


def get_params(row, attribs):
    """Return a dict holding only the ``attribs`` keys whose value in ``row`` is truthy.

    Keys that are missing from ``row`` or map to a falsy value (``None``,
    ``""``, ``0``, ``False``, empty containers) are left out.
    """
    return {name: row.get(name) for name in attribs if row.get(name)}


def load_article(row):
    """Return the ScholarlyArticles entry for ``row``, creating it if needed.

    ``row`` is a dict-like record.  The keys read here are ``doi``, ``year``
    and ``z_authors``; the journal keys are read by ``load_journal``.  An
    article already stored under the same DOI is returned untouched.

    A missing or null ``z_authors`` value is treated as "no contributors"
    rather than raising ``KeyError``/``TypeError``.
    """
    articles = models.ScholarlyArticles.objects.filter(doi=row.get('doi'))
    try:
        return articles[0]
    except IndexError:
        pass

    article = models.ScholarlyArticles()
    article.doi = row.get('doi')
    article.year = row.get('year')
    article.journal = load_journal(row)
    # The article needs a primary key before contributors can be attached.
    article.save()
    for author in row.get('z_authors') or []:
        contributor = get_one_contributor(author)
        article.contributors.add(contributor)
    article.save()
    return article


def load_journal(row):
    """Return the Journals entry described by ``row``, creating it if needed.

    The lookup uses whichever of ``journal_issns``, ``journal_issn_l`` and
    ``journal_name`` are present in ``row``.  When none of them is present
    there is nothing to identify a journal by, so ``None`` is returned; an
    unfiltered query would otherwise match every stored journal and hand
    back an arbitrary one.
    """
    attribs = ['journal_issns', 'journal_issn_l', 'journal_name']
    params = get_params(row, attribs)
    if not params:
        return None

    try:
        return models.Journals.objects.filter(**params)[0]
    except IndexError:
        pass

    journal = models.Journals()
    journal.journal_is_in_doaj = row.get('journal_is_in_doaj')
    journal.journal_issns = row.get('journal_issns')
    journal.journal_issn_l = row.get('journal_issn_l')
    journal.journal_name = row.get('journal_name')
    journal.publisher = row.get('publisher')
    journal.save()
    return journal


def _first_affiliation_name(author):
    """Return the name of the author's first listed affiliation, or None."""
    affiliations = author.get('affiliation')
    if not affiliations:
        return None
    first = affiliations[0]
    return first.get('name') if isinstance(first, dict) else None


def get_one_contributor(author):
    """Return the Contributors entry for ``author``, creating it if needed.

    ``author`` is a dict-like author record.  An existing contributor is
    looked up by ``family``/``given`` plus the ORCID when one is given, or
    else plus the already-stored affiliation matching the first affiliation
    name.  When the record offers nothing to search by, no lookup is made
    (an unfiltered query would match an arbitrary contributor) and a new
    entry is created.
    """
    params = get_params(author, ['family', 'given'])
    affiliation_name = _first_affiliation_name(author)

    if author.get('ORCID'):
        params['orcid'] = author.get('ORCID')
    elif affiliation_name:
        try:
            params['affiliation'] = models.Affiliations.objects.filter(name=affiliation_name)[0]
        except IndexError:
            # Unknown affiliation: fall back to matching on the names alone.
            pass

    if params:
        try:
            return models.Contributors.objects.filter(**params)[0]
        except IndexError:
            pass

    contributor = models.Contributors()
    contributor.family = author.get('family')
    contributor.given = author.get('given')
    contributor.orcid = author.get('ORCID')
    contributor.authenticated_orcid = author.get('authenticated-orcid')
    # Only resolve an affiliation when there is a usable name for it.
    if affiliation_name:
        contributor.affiliation = load_affiliation(affiliation_name)
    contributor.save()
    return contributor


def load_affiliation(affiliation_name):
    """Return the Affiliations entry named ``affiliation_name``, creating it if needed.

    Returns ``None`` when ``affiliation_name`` is empty or ``None``; without
    this guard the function would fail on an unbound local variable.
    """
    if not affiliation_name:
        return None

    try:
        return models.Affiliations.objects.filter(name=affiliation_name)[0]
    except IndexError:
        pass

    affiliation = models.Affiliations()
    affiliation.name = affiliation_name
    affiliation.save()
    return affiliation


def run(from_year=1900, resource_type='journal-article'):
    """Load an article for every stored raw record that is not paratext.

    Raw records are selected by ``year >= from_year`` and by
    ``resource_type``; each selected record's JSON payload is handed to
    ``load_article``.
    """
    # NOTE(review): the original carries a bare "pagination" marker here,
    # presumably a reminder to page through the queryset.
    records = models.RawUnpaywall.objects.filter(
        year__gte=from_year,
        resource_type=resource_type,
    )
    for record in records:
        if record.is_paratext:
            continue
        load_article(record.json)


if __name__ == '__main__':
    run()
34 changes: 34 additions & 0 deletions scholarly_articles/scripts/load_raw_unpaywall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from scholarly_articles import models

import json
from datetime import date


def load(row):
    """Store ``row`` as a new RawUnpaywall record unless its DOI is already stored.

    ``row`` is a dict-like record.  Rows without a DOI, and rows whose DOI
    already has a stored record, are skipped.  Returns ``None`` in every
    case.
    """
    doi = row.get('doi')
    if not doi:
        # Nothing to key the record on.
        return
    # exists() asks the database for a yes/no answer instead of fetching
    # every matching row just to count them.
    if models.RawUnpaywall.objects.filter(doi=doi).exists():
        return

    rawunpaywall = models.RawUnpaywall()
    rawunpaywall.doi = doi
    # harvesting_creation is a character field: store the ISO date text.
    rawunpaywall.harvesting_creation = date.today().isoformat()
    rawunpaywall.is_paratext = row.get('is_paratext')
    rawunpaywall.year = row.get('year')
    rawunpaywall.resource_type = row.get('genre')
    updated = row.get('updated')
    if isinstance(updated, str):
        # Keep only the leading "YYYY-MM-DD" part of the timestamp.
        rawunpaywall.update = updated[:10]
    rawunpaywall.json = row
    rawunpaywall.save()


def run(path='scholarly_articles/scripts/examples.json'):
    """Load every record of a JSON-lines file through ``load``.

    ``path`` points to a file with one JSON object per line and defaults to
    the bundled example file.  The file is read as UTF-8 and closed when
    done; blank lines are skipped.  Records are loaded as they are read, so
    a malformed line raises after the preceding records were already
    loaded.
    """
    with open(path, encoding='utf-8') as source:
        for line in source:
            if not line.strip():
                continue
            load(json.loads(line))
Loading

0 comments on commit 93ca75d

Please sign in to comment.