Skip to content

Commit 9e1f97c

Browse files
authored
Merge pull request #234 from golnazads/master
added JATS export, first pass
2 parents 8acbba1 + 62de892 commit 9e1f97c

File tree

6 files changed

+290
-2
lines changed

6 files changed

+290
-2
lines changed

‎.github/workflows/python_actions.yml‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
- name: Install dependencies
1717
run: |
1818
# python -m pip install --upgrade setuptools pip
19-
python -m pip install --upgrade pip
19+
python -m pip install --upgrade wheel pip
2020
pip install -U -r requirements.txt
2121
pip install -U -r dev-requirements.txt
2222

‎exportsrv/formatter/xmlFormat.py‎

Lines changed: 254 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,31 @@
55
from datetime import datetime
66
from flask import current_app
77
from textwrap import fill
8+
import re
9+
from geotext import GeoText
10+
from csv import reader
811

912
from exportsrv.formatter.format import Format
1013
from exportsrv.utils import get_eprint
1114
from exportsrv.formatter.strftime import strftime
1215

13-
# This class accepts JSON object created by Solr and can reformats it
16+
# This class accepts JSON object created by Solr and can reformat it
1417
# for the XML Export formats we are supporting.
1518
# 1- To get Dublin Core XML use
1619
# dublinXML = XMLFormat(jsonFromSolr).get_dublincore_xml()
1720
# 2- To get Reference XML without Abstract use
1821
# referenceXML = XMLFormat(jsonFromSolr).get_reference_xml()
1922
# 3- To get Reference XML with Abstract use
2023
# referenceXML = XMLFormat(jsonFromSolr).get_reference_xml(True)
24+
# 4- To get JATS XML use
25+
# jatsXML = XMLFormat(jsonFromSolr).get_jats_xml()
2126

2227
class XMLFormat(Format):
2328

2429
EXPORT_FORMAT_REF_XML = 'ReferenceXML'
2530
EXPORT_FORMAT_REF_ABS_XML = 'ReferenceAbsXML'
2631
EXPORT_FORMAT_DUBLIN_XML = 'DublinXML'
32+
EXPORT_FORMAT_JATS_XML = 'JATSXML'
2733

2834
EXPORT_SERVICE_RECORDS_SET_XML_REF = [('xmlns', 'http://ads.harvard.edu/schema/abs/1.1/references'),
2935
('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'),
@@ -38,6 +44,18 @@ class XMLFormat(Format):
3844
('xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
3945
('xsi:schemaLocation', 'http://ads.harvard.edu/schema/abs/1.1/dc http://ads.harvard.edu/schema/abs/1.1/dc.xsd')]
4046

47+
EXPORT_SERVICE_RECORDS_SET_XML_JATS = [('xmlns', 'http://ads.harvard.edu/schema/abs/1.1/dc'),
48+
('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'),
49+
('xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
50+
('xsi:schemaLocation', 'http://ads.harvard.edu/schema/abs/1.0/jats http://ads.harvard.edu/schema/abs/1.0/jats.xsd')]
51+
52+
# four digit year between 1800 and 2099 that is not part of a longer number;
# group 1 holds the year
re_year = re.compile(r'(?<!\d)((?:1[89]|20)\d\d)(?!\d)')
53+
54+
# partial list of known publishers,
55+
# once this field is populated in solr, rely on solr,
56+
# for now this is only used for JATS format
57+
re_publisher_names = re.compile(r"([A-Z]+[A-Za-z\s\-:]+ University Press|[A-Z]+[A-Za-z\s\-:,']+ Press|Springer- .*|Elsevier|University of [A-Z]+[A-Za-z\s-]+ Press|University of [A-Z]+[A-Za-z\s-]+|[\w-]+,\s*\w+\s*:[\s\w]+|Springer\s+([A-Z]+[A-Za-z\s-]+)+|Springer Nature|Springer, Cham|Springer Fachmedien Wiesbaden GmbH, DE|Springer-Verlag GmbH Deutschland|Springer-Verlag Berlin Heidelberg|Springer-Verlag|Springer|Cambridge, J. Wilson and son, University press|Cambridge, The University press|Oxford university press|Loyola university press|Cambridge, University press|Harvard university press|Cambridge, Eng., The University press|Louisiana state university press|Des Moines, Iowa, University press|Cambridge [Eng.] The University press|Edinburgh University Press|Cambridge University Press|Yale University Press|Brigham Young University Press|University Press of Virginia|Erevan University Press|Artemis Press|Laval University Press|Columbia University Press|Rutgers University Press|University Press of America|Johns Hopkins University Press|Sydney University Press|Yerevan University press|McGill-Queen's University Press|Innsbruck University Press|University of Arizona Press|Atlantis Press|Ilia State University Press|Ziti press|The University of Chicago Press|Princeton University Press|eXamen.press|T rculo Press|Duke University Press|Kyoto University Press|Imperial College Press|Heron Press Ltd|Kyiv University Press|Sole Logistics Press|BrownWalker Press|Joseph Henry Press|National Radio Astronomy Press|SPIE Press|Kyriakidis Press|St. Martin's Press|Huntington Library and University of Washington Press|Microcosm Press|Free Press \(Simon and Schuster\)|Cambridge Univ. Press|Templeton Foundation Press|IEEE Press|Heron Press|AIP Press|Pergamon Press|Boydell Press|Baltic Astronomy 6 and L. 
Davis Press|West Virginia University Press|ACM Press Books|State University of New York \(SUNY\) Press|Clarendon Press|Universal Academic Press Inc|SPIE Optical Engineering Press|Presses universitaires de France|Ginn Press|ABELexpress|SPC Press|CRC Press|Plenum Press|Pedagogical Univ. Press|The MIT \(Massachusetts Institute of Technology\) Press|Yourdon Press Computing Series|L. Davis Press|Ivy Press Books|Moscow Univ. Press|Cambridge Univ. Press|Presses du CNRS|Presses de l'Ecole nationale des ponts et chaussees|Academic Press Inc|Cambridge UniversityPress|Vantage Press Inc|Massachusettes Institute of Technology \(MIT\) Press|IAP Press|Academic Press and OHM|IEEE Comput. Soc. Press|The Weizmann Science Press of Israel|MIT Press|Process Press|Blandford Press|Science Press|University of Tasmania Press|Vantage Press|Arno Press|Academic Press|University of Massachusetts Press|Delacorte Press/E. Friede|The Macmillan Press Ltd|Peebles Press|Anchor Press/Doubleday|Smithsonian Institution Press|Anchor Press / Doubleday|FAN Press|Univ. Calif. Press|Presses de la Cite|Ballena Press|Pica Press|University of Texas Press|Optosonic Press|University of Missouri Press|University of Alabama Press|Nauka Press|Exposition Press|Presses universitaire de France|Viking Press|Priory Press|Chemical Rubber Co. Press|Books for Libraries Press|Pragopress|Fundamental Research Press|University of California Press|University of Michigan Press|University of New Mexico Press|University of London Press|Natural History Press|Lenin Belorussian State University Press|Crowell-Collier Press|Greenwood Press|NEO Press|M.I.T. Press|Univ. Wisconsin Press|University of Chicago Press|University of Colorado Press|Brockhampton Press|Golden Press|Lutterworth Press|Trident Press Book|Beacon Press|St Martin's Press|Pageant Press|M. I. T. 
Press|Orion Press|The Univesrity of Chicago Press|Museum Press|Citadel Press|Pegasus Press|Childrens Press|Ronald Press Co|Majestic Press|Westernlore Press|The Science press printing company|The Technical press ltd|The Florida Bible institute press|The Sheldon press|Pacific Science Press|The Theosophical press|The Clarendon press|The Pilgrim press|The Hispanic Society of America and The De Vinne Press|Press of E. W. Stephen|The Nichols press|Press of T. P. Nichols|Press of J. Wilson and son|Roy. Acad. press)")
58+
4159
def __format_date(self, solr_date, export_format):
4260
"""
4361
@@ -350,6 +368,10 @@ def __get_fields(self, export_format):
350368
('pub_raw', 'dc:source'), ('pubdate', 'dc:date'), ('keyword', 'dc:subject'),
351369
('copyright', 'dc:rights'), ('url', 'dc:relation'), ('num_citations', 'dc:relation'),
352370
('abstract', 'dc:description'), ('doi', 'dc:identifier')]
371+
elif (export_format == self.EXPORT_FORMAT_JATS_XML):
372+
fields = [('doctype', ''), ('author', ''), ('year', ''), ('title', ''),
373+
('pub', ''), ('pub_raw', ''), ('volume', 'volume'), ('issue', 'issue'),
374+
('editor', ''), ('publisher', ''), ('page', ''), ('page_range', ''), ('doi', '')]
353375
else:
354376
fields = []
355377
return OrderedDict(fields)
@@ -523,6 +545,226 @@ def __get_doc_reference_xml(self, index, parent, export_format):
523545
self.__add_in(record, fields[field], get_eprint(a_doc))
524546

525547

548+
def __add_person_group_jats_xml(self, person_list, record, person_group_type):
    """
    add authors or editors for JATS xml format

    Appends a <person-group> element to `record` holding one <string-name>
    per entry of `person_list`, each with a <surname> and, when there is one,
    a <given-names> tag carrying the first initial. For the editor group a
    <role>Eds.</role> element is appended to `record` after the group.

    :param person_list: list of names, each split on comma into surname and given names
    :param record: parent element that receives the new tags
    :param person_group_type: value for the person-group-type attribute, i.e., `author` or `editor`
    :return:
    """
    if not person_list:
        return

    # add outer tag
    person_group_record = ET.SubElement(record, 'person-group')
    person_group_record.set('person-group-type', person_group_type)

    # now add inner tags, one per person
    for person in person_list:
        separate = person.split(',')
        person_record = ET.SubElement(person_group_record, 'string-name')
        ET.SubElement(person_record, 'surname').text = separate[0].strip()
        # person might not have a first name; a trailing comma (`Last,`) leaves
        # an empty given names part, which has no initial to take
        if len(separate) == 2:
            given_names = separate[1].strip()
            if given_names:
                ET.SubElement(person_record, 'given-names').text = '%s.' % given_names[0]

    # add role tag if this is editor type
    if person_group_type == 'editor':
        ET.SubElement(record, 'role').text = 'Eds.'
572+
573+
574+
def __add_title_jats_xml(self, title, record, publication_type, lookahead):
    """
    format title based on JATS publication type

    The title parts are joined with a semicolon and attached to `record`
    in a tag, and with trailing punctuation, that depend on the publication type.

    :param title: iterable of title strings
    :param record: parent element that receives the title tag
    :param publication_type: JATS publication type
    :param lookahead: truthy if the record has editor, in which case a book title is followed by semicolon
    :return:
    """
    joined_title = ';'.join(title)

    # <article-title>title</article-title>.   for journal and report
    # <article-title>title</article-title>,   for confproc
    article_title_tails = {'journal': '.\n', 'report': '.\n', 'confproc': ',\n'}
    tail = article_title_tails.get(publication_type)
    if tail is not None:
        article_title = ET.SubElement(record, 'article-title')
        article_title.text = joined_title
        article_title.tail = tail
        return

    # book: italic title; with editor it is followed by semicolon
    # NOTE(review): the original comment describes a <source> wrapper while
    # the code emits <title> -- confirm which tag is intended
    if publication_type == 'book':
        book_title = ET.SubElement(record, 'title')
        ET.SubElement(book_title, 'italic').text = joined_title
        if lookahead:
            book_title.tail = ';\n'
        return

    # <source>Ph.D. thesis</source>
    if publication_type == 'thesis':
        ET.SubElement(record, 'source').text = 'Ph.D. thesis'
        return

    # <source>title</source>
    if publication_type in ('software', 'review', 'other'):
        ET.SubElement(record, 'source').text = joined_title
610+
611+
612+
def __add_conf_proc_info_jats_xml(self, pub_raw, record):
    """
    for confproc publication type, jats format the following tags are filled when they can be inferred
        <conf-name>first comma separated part of pub_raw that mentions `conference`</conf-name>,
        <conf-loc>cities/countries that GeoText detects in pub_raw</conf-loc>,
        <year>first year that appears in pub_raw</year>.
    the month of conference is not available in most records so for now it is ignored

    :param pub_raw: raw publication string
    :param record: parent element that receives the new tags
    :return:
    """
    # see if the year appears in pub_raw
    match = self.re_year.search(pub_raw)
    year = match.group(1) if match else None

    # see if the location is in pub_raw; the detected cities and countries are
    # lists, so flatten them into one comma separated string, skipping repeats
    places = GeoText(pub_raw)
    detected = []
    for place in list(places.cities) + list(places.countries):
        if place not in detected:
            detected.append(place)
    location = ', '.join(detected)

    # now split the pub_raw and try to see if conference name can be inferred
    # (csv reader keeps quoted substrings that contain commas in one piece)
    parts = next(reader([pub_raw]), [])
    conference = [part for part in parts if 'conference' in part.lower()]

    if conference:
        ET.SubElement(record, 'conf-name').text = conference[0]
    if location:
        ET.SubElement(record, 'conf-loc').text = location
    if year:
        ET.SubElement(record, 'year').text = year
648+
649+
650+
def __add_book_publisher_info_jats_xml(self, pub_raw, record):
    """
    for book publication type, jats format the following two tags are filled when
    a known publisher is found in pub_raw
        <publisher-loc>cities/countries that GeoText detects in pub_raw</publisher-loc>:
        <publisher-name>first match of the known publisher names in pub_raw</publisher-name>.
    nothing is added, not even the location, when no publisher is recognized

    :param pub_raw: raw publication string
    :param record: parent element that receives the new tags
    :return:
    """
    # see if the location is in pub_raw; the detected cities and countries are
    # lists, so flatten them into one comma separated string, skipping repeats
    places = GeoText(pub_raw)
    detected = []
    for place in list(places.cities) + list(places.countries):
        if place not in detected:
            detected.append(place)
    location = ', '.join(detected)

    # the second positional argument of a compiled pattern's search is the start
    # position, not flags, so passing re.IGNORECASE there only skipped the first
    # two characters; search the whole string with the flags the pattern was compiled with
    match = self.re_publisher_names.search(pub_raw)
    publisher = match.group(1) if match else ''

    if not publisher:
        return

    if location:
        location_record = ET.SubElement(record, 'publisher-loc')
        location_record.text = location
        location_record.tail = ': '
    ET.SubElement(record, 'publisher-name').text = publisher
681+
682+
def __get_doc_jats_xml(self, index, parent):
    """
    for each document from Solr, get the fields, and format them accordingly for JATS format

    Adds to `parent` a <ref> element containing a <label> and a <mixed-citation>;
    the fields of the document that have a value are appended to the
    mixed-citation in the order returned for the JATS export format.

    :param index: position of the document in the Solr response, also used for the reference id and label
    :param parent: element that receives the new <ref>
    :return:
    """
    ads_to_jats_doctype_mapping = {
        'book': 'book', 'inproceedings': 'book', 'inbook': 'book',
        'proceedings': 'confproc',
        'article': 'journal', 'abstract': 'journal', 'eprint': 'journal',
        'phdthesis': 'thesis', 'mastersthesis': 'thesis',
        'software': 'software',
        'techreport': 'report',
        'bookreview': 'review',
        'circular': 'other', 'editorial': 'other', 'erratum': 'other', 'misc': 'other', 'catalog': 'other',
        'newsletter': 'other', 'obituary': 'other', 'pressrelease': 'other', 'proposal': 'other', 'talk': 'other',
    }
    a_doc = self.from_solr['response'].get('docs')[index]
    fields = self.__get_fields(self.EXPORT_FORMAT_JATS_XML)

    # add outer tag and label for this reference
    ref = ET.SubElement(parent, 'ref', id='CIT%03d' % (index + 1))
    ET.SubElement(ref, 'label').text = '%d.' % (index + 1)

    # the citation element is created up front so that it exists for every
    # other field; a missing or unmapped doctype falls back to `other`
    # instead of failing
    publication_type = ads_to_jats_doctype_mapping.get(a_doc.get('doctype'), 'other')
    record = ET.SubElement(ref, 'mixed-citation')
    record.set('publication-type', publication_type)

    def add_text_after_last_element(text):
        # free text between tags is attached as the tail of the last child,
        # or as the text of the citation itself when it has no child yet
        if len(record):
            record[-1].tail = text
        else:
            record.text = text

    for field in fields:
        # doctype is already consumed above
        if field == 'doctype':
            continue
        value = a_doc.get(field, None)
        if not value:
            continue

        if field in ('author', 'editor'):
            self.__add_person_group_jats_xml(value, record, field)
        elif field == 'year':
            # year appears in parenthesis, so need to add open parenthesis before the tag
            add_text_after_last_element('\n(')
            year = ET.SubElement(record, 'year')
            year.text = value
            # now add the close parenthesis
            year.tail = ')\n'
        elif field == 'title':
            self.__add_title_jats_xml(value, record, publication_type, a_doc.get('editor', None))
        elif field == 'pub':
            if publication_type == 'journal':
                source_record = ET.SubElement(record, 'source')
                ET.SubElement(source_record, 'italic').text = value
        elif field == 'pub_raw':
            if publication_type == 'confproc':
                self.__add_conf_proc_info_jats_xml(value, record)
            # TODO: once solr contains publisher info need to remove extracting publisher from pub_raw
            elif publication_type in ('book', 'report'):
                self.__add_book_publisher_info_jats_xml(value, record)
        elif field == 'volume':
            ET.SubElement(record, 'volume').text = value
        elif field == 'issue':
            # issue appears in parenthesis, so need to add open parenthesis before the tag
            add_text_after_last_element('(\n')
            issue = ET.SubElement(record, 'issue')
            issue.text = value
            # now add the close parenthesis followed by colon
            issue.tail = '):'
        elif field == 'page':
            ET.SubElement(record, 'fpage').text = ''.join(value)
        elif field == 'page_range':
            pages = ''.join(value).split('-')
            if len(pages) == 2:
                # need to insert a dash before lastpage
                # NOTE(review): presumably meant to be the en dash entity -- confirm
                add_text_after_last_element(' #x2013;')
                ET.SubElement(record, 'lpage').text = pages[1]
            # insert a dot after page info
            add_text_after_last_element('.\n')
        elif field == 'doi':
            # need to add `doi:` before tag
            add_text_after_last_element(' doi:')
            doi = ET.SubElement(record, 'pub-id')
            doi.set('pub-id-type', 'doi')
            doi.text = ''.join(value)
766+
767+
526768
def __get_xml(self, export_format):
527769
"""
528770
setup the outer xml structure
@@ -550,6 +792,9 @@ def __get_xml(self, export_format):
550792
elif (export_format == self.EXPORT_FORMAT_DUBLIN_XML):
551793
for index in range(num_docs):
552794
self.__get_doc_dublin_xml(index, records)
795+
elif (export_format == self.EXPORT_FORMAT_JATS_XML):
796+
for index in range(num_docs):
797+
self.__get_doc_jats_xml(index, records)
553798
format_xml = ET.tostring(records, encoding='utf8', method='xml')
554799
format_xml = (b'>\n<'.join(format_xml.split(b'><')))
555800
format_xml = format_xml.replace(b'</record>', b'</record>\n')
@@ -577,3 +822,11 @@ def get_dublincore_xml(self):
577822
"""
578823
return self.__get_xml(self.EXPORT_FORMAT_DUBLIN_XML)
579824

825+
826+
def get_jats_xml(self):
    """
    build the export for the JATS XML format

    :return: jats xml format
    """
    export_format = self.EXPORT_FORMAT_JATS_XML
    return self.__get_xml(export_format)
832+

‎exportsrv/tests/unittests/stubdata/xmlTest.py‎

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

‎exportsrv/tests/unittests/test_export_service.py‎

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,12 @@ def test_refxml_with_abs(self):
108108
# now compare it with an already formatted data that we know is correct
109109
assert(xml_export == xmlTest.data_ref_with_abs)
110110

111+
def test_jatsxml(self):
    # run the stubdata through the JATS formatter
    formatted = XMLFormat(solrdata.data).get_jats_xml()
    # the stored export is known to be correct, so the two have to be identical
    expected = xmlTest.data_jats
    assert(formatted == expected)
116+
111117
def test_aastex(self):
112118
# format the stubdata using the code
113119
csl_export = CSL(CSLJson(solrdata.data).get(), 'aastex', adsFormatter.latex).get()

0 commit comments

Comments
 (0)