通过 DOI 更新 Bibtex 字段

Question

与此同时，我已经编写了一个小型的 Python 程序，它完全可以满足我的需要：

它遍历整个 bibtexml 文件，从 DOI 源检索期刊标题，并用检索到的标题替换过时的条目。如果无法自动检索该条目，小脚本会尝试以启发式方法（规范化标题的大小写）完成此任务：

from xml.dom import minidom
import subprocess
import urllib.request
import urllib.error
import json
import codecs

# Update the <journal> field of every <entry> in a bibtexml file with the
# journal title registered for the entry's DOI.  Entries that only carry a
# PMID are first converted to a DOI.  When a web lookup answers 404, the
# existing journal title is merely re-capitalised as a best-effort fallback.

PMID2DOI_URL = "http://www.pmid2doi.org/rest/json/doi/"
DOI_RESOLVER_URL = "http://dx.doi.org/"
# Words that str.title() capitalises but that are lower-cased again when they
# appear in the middle of a title (surrounded by spaces).
MINOR_WORDS = ("Of", "The", "And", "In")


def node_text(nodes):
    """Return the text data of the first node in *nodes*, or None.

    None is returned when *nodes* is empty, when the element has no child
    (e.g. ``<doi/>``) or when its first child carries no ``data`` attribute,
    so callers never dereference a missing ``firstChild``.
    """
    if not nodes:
        return None
    return getattr(nodes[0].firstChild, "data", None)


def heuristic_journal_title(raw_title):
    """Return *raw_title* in title case with minor words lower-cased.

    Only occurrences surrounded by spaces are lower-cased, so a leading
    "The" keeps its capital.
    """
    title = raw_title.title()
    for word in MINOR_WORDS:
        title = title.replace(" " + word + " ", " " + word.lower() + " ")
    return title


def apply_heuristic_title(entry):
    """Re-capitalise the entry's journal title in place.

    Returns the new title, or None when the entry has no non-empty
    <journal> element (in which case nothing is changed or printed).
    """
    journal_nodes = entry.getElementsByTagName('journal')
    old_title = node_text(journal_nodes)
    if old_title is None:
        return None
    new_title = heuristic_journal_title(old_title)
    journal_nodes[0].firstChild.data = new_title
    print("Fix it manually: " + old_title + " --> " + new_title)
    return new_title


def report_unavailable(entry):
    """Announce a failed web lookup and fall back to the heuristic title."""
    print("No web resources available :(")
    apply_heuristic_title(entry)


def fetch_doi_for_pmid(pmid):
    """Ask pmid2doi.org for the DOI belonging to *pmid*.

    Returns the DOI string, or None when the service answers 404 or the
    JSON reply carries no usable 'doi' value.  Any other HTTP error is
    re-raised.
    """
    req = urllib.request.Request(
        PMID2DOI_URL + pmid,
        method='GET',
        headers={'Accept': 'application/json',
                 'Content-Type': 'application/json; charset=UTF-8'},
    )
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req) as response:
            payload = response.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return None
        raise
    reply = json.loads(payload)
    doi = reply.get('doi') if isinstance(reply, dict) else None
    return doi if isinstance(doi, str) and doi else None


def fetch_crossref_record(doi):
    """Resolve *doi* and return the parsed unixref XML document.

    Returns None when the resolver answers 404; any other HTTP error is
    re-raised.
    """
    req = urllib.request.Request(
        DOI_RESOLVER_URL + doi,
        headers={"Accept": "application/vnd.crossref.unixref+xml"},
    )
    try:
        with urllib.request.urlopen(req) as response:
            return minidom.parse(response)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return None
        raise


def crossref_journal_title(record):
    """Return the <full_title> of the first <journal_metadata>, or None."""
    metadata = record.getElementsByTagName('journal_metadata')
    if not metadata:
        return None
    return node_text(metadata[0].getElementsByTagName('full_title'))


def attach_doi(xmldoc, entry, doi):
    """Append a new <doi> element holding *doi* below *entry*.

    The element is added to the entry's first element child (presumably the
    entry-type element such as <article> -- confirm against the input
    files).  Indexing ``childNodes[1]`` only reaches that element when the
    file is pretty-printed and childNodes[0] is whitespace text; searching
    by node type works either way.  If the entry has no element child the
    <doi> is appended to the entry itself.
    """
    doi_node = xmldoc.createElement("doi")
    doi_node.appendChild(xmldoc.createTextNode(doi))
    parent = next(
        (child for child in entry.childNodes
         if child.nodeType == child.ELEMENT_NODE),
        entry,
    )
    parent.appendChild(doi_node)


def process_entry(xmldoc, entry):
    """Update the journal title of a single <entry> element in place."""
    print("Current item: " + str(entry.attributes['id'].value))

    # Convert from PMID to DOI, but only when the entry has no DOI at all.
    pmid = node_text(entry.getElementsByTagName('pmid'))
    if not entry.getElementsByTagName('doi') and pmid is not None:
        print("DOI not available, but PMID found...")
        converted = fetch_doi_for_pmid(pmid)
        if converted is None:
            report_unavailable(entry)
            return
        print("Convert: " + pmid + " --> " + converted)
        attach_doi(xmldoc, entry, converted)

    # Crawl data by DOI
    doi = node_text(entry.getElementsByTagName('doi'))
    if doi is None:
        print("Item has neither DOI nor PMID. Skip it...")
        return
    print("DOI: " + doi)

    record = fetch_crossref_record(doi)
    if record is None:
        report_unavailable(entry)
        return

    fetched_title = crossref_journal_title(record)
    if fetched_title is None:
        return
    print("Fetched journal title: " + fetched_title)

    # Link parsed data to existing base data
    journal_nodes = entry.getElementsByTagName('journal')
    current_title = node_text(journal_nodes)
    if current_title is not None:
        print("Update: " + current_title + " --> " + fetched_title)
        journal_nodes[0].firstChild.data = fetched_title


def main(in_path='library.xml', out_path='library-out.xml'):
    """Read *in_path*, update every entry and write the result to *out_path*."""
    xmldoc = minidom.parse(in_path)
    itemlist = xmldoc.getElementsByTagName('entry')

    print('Number of items: ' + str(itemlist.length))
    print("")

    for entry in itemlist:
        process_entry(xmldoc, entry)
        print("")

    # toxml(encoding=...) returns bytes, hence the binary mode.
    with open(out_path, "wb") as f:
        f.write(xmldoc.toxml(encoding="utf-8"))


if __name__ == "__main__":
    main()

这个 Python 脚本是为 Python 3.x 编写的

可以使用此工具将 bibtex 文件转换为 bibtexml 文件：http://sourceforge.net/projects/bibtexml/files/

Answer 1

与此同时，我已经编写了一个小型的 Python 程序，它完全可以满足我的需要：

它遍历整个 bibtexml 文件，从 DOI 源检索期刊标题，并用检索到的标题替换过时的条目。如果无法自动检索该条目，小脚本会尝试以启发式方法（规范化标题的大小写）完成此任务：

from xml.dom import minidom
import subprocess
import urllib.request
import urllib.error
import json
import codecs

# Update the <journal> field of every <entry> in a bibtexml file with the
# journal title registered for the entry's DOI.  Entries that only carry a
# PMID are first converted to a DOI.  When a web lookup answers 404, the
# existing journal title is merely re-capitalised as a best-effort fallback.

PMID2DOI_URL = "http://www.pmid2doi.org/rest/json/doi/"
DOI_RESOLVER_URL = "http://dx.doi.org/"
# Words that str.title() capitalises but that are lower-cased again when they
# appear in the middle of a title (surrounded by spaces).
MINOR_WORDS = ("Of", "The", "And", "In")


def node_text(nodes):
    """Return the text data of the first node in *nodes*, or None.

    None is returned when *nodes* is empty, when the element has no child
    (e.g. ``<doi/>``) or when its first child carries no ``data`` attribute,
    so callers never dereference a missing ``firstChild``.
    """
    if not nodes:
        return None
    return getattr(nodes[0].firstChild, "data", None)


def heuristic_journal_title(raw_title):
    """Return *raw_title* in title case with minor words lower-cased.

    Only occurrences surrounded by spaces are lower-cased, so a leading
    "The" keeps its capital.
    """
    title = raw_title.title()
    for word in MINOR_WORDS:
        title = title.replace(" " + word + " ", " " + word.lower() + " ")
    return title


def apply_heuristic_title(entry):
    """Re-capitalise the entry's journal title in place.

    Returns the new title, or None when the entry has no non-empty
    <journal> element (in which case nothing is changed or printed).
    """
    journal_nodes = entry.getElementsByTagName('journal')
    old_title = node_text(journal_nodes)
    if old_title is None:
        return None
    new_title = heuristic_journal_title(old_title)
    journal_nodes[0].firstChild.data = new_title
    print("Fix it manually: " + old_title + " --> " + new_title)
    return new_title


def report_unavailable(entry):
    """Announce a failed web lookup and fall back to the heuristic title."""
    print("No web resources available :(")
    apply_heuristic_title(entry)


def fetch_doi_for_pmid(pmid):
    """Ask pmid2doi.org for the DOI belonging to *pmid*.

    Returns the DOI string, or None when the service answers 404 or the
    JSON reply carries no usable 'doi' value.  Any other HTTP error is
    re-raised.
    """
    req = urllib.request.Request(
        PMID2DOI_URL + pmid,
        method='GET',
        headers={'Accept': 'application/json',
                 'Content-Type': 'application/json; charset=UTF-8'},
    )
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req) as response:
            payload = response.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return None
        raise
    reply = json.loads(payload)
    doi = reply.get('doi') if isinstance(reply, dict) else None
    return doi if isinstance(doi, str) and doi else None


def fetch_crossref_record(doi):
    """Resolve *doi* and return the parsed unixref XML document.

    Returns None when the resolver answers 404; any other HTTP error is
    re-raised.
    """
    req = urllib.request.Request(
        DOI_RESOLVER_URL + doi,
        headers={"Accept": "application/vnd.crossref.unixref+xml"},
    )
    try:
        with urllib.request.urlopen(req) as response:
            return minidom.parse(response)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return None
        raise


def crossref_journal_title(record):
    """Return the <full_title> of the first <journal_metadata>, or None."""
    metadata = record.getElementsByTagName('journal_metadata')
    if not metadata:
        return None
    return node_text(metadata[0].getElementsByTagName('full_title'))


def attach_doi(xmldoc, entry, doi):
    """Append a new <doi> element holding *doi* below *entry*.

    The element is added to the entry's first element child (presumably the
    entry-type element such as <article> -- confirm against the input
    files).  Indexing ``childNodes[1]`` only reaches that element when the
    file is pretty-printed and childNodes[0] is whitespace text; searching
    by node type works either way.  If the entry has no element child the
    <doi> is appended to the entry itself.
    """
    doi_node = xmldoc.createElement("doi")
    doi_node.appendChild(xmldoc.createTextNode(doi))
    parent = next(
        (child for child in entry.childNodes
         if child.nodeType == child.ELEMENT_NODE),
        entry,
    )
    parent.appendChild(doi_node)


def process_entry(xmldoc, entry):
    """Update the journal title of a single <entry> element in place."""
    print("Current item: " + str(entry.attributes['id'].value))

    # Convert from PMID to DOI, but only when the entry has no DOI at all.
    pmid = node_text(entry.getElementsByTagName('pmid'))
    if not entry.getElementsByTagName('doi') and pmid is not None:
        print("DOI not available, but PMID found...")
        converted = fetch_doi_for_pmid(pmid)
        if converted is None:
            report_unavailable(entry)
            return
        print("Convert: " + pmid + " --> " + converted)
        attach_doi(xmldoc, entry, converted)

    # Crawl data by DOI
    doi = node_text(entry.getElementsByTagName('doi'))
    if doi is None:
        print("Item has neither DOI nor PMID. Skip it...")
        return
    print("DOI: " + doi)

    record = fetch_crossref_record(doi)
    if record is None:
        report_unavailable(entry)
        return

    fetched_title = crossref_journal_title(record)
    if fetched_title is None:
        return
    print("Fetched journal title: " + fetched_title)

    # Link parsed data to existing base data
    journal_nodes = entry.getElementsByTagName('journal')
    current_title = node_text(journal_nodes)
    if current_title is not None:
        print("Update: " + current_title + " --> " + fetched_title)
        journal_nodes[0].firstChild.data = fetched_title


def main(in_path='library.xml', out_path='library-out.xml'):
    """Read *in_path*, update every entry and write the result to *out_path*."""
    xmldoc = minidom.parse(in_path)
    itemlist = xmldoc.getElementsByTagName('entry')

    print('Number of items: ' + str(itemlist.length))
    print("")

    for entry in itemlist:
        process_entry(xmldoc, entry)
        print("")

    # toxml(encoding=...) returns bytes, hence the binary mode.
    with open(out_path, "wb") as f:
        f.write(xmldoc.toxml(encoding="utf-8"))


if __name__ == "__main__":
    main()

这个 Python 脚本是为 Python 3.x 编写的

可以使用此工具将 bibtex 文件转换为 bibtexml 文件：http://sourceforge.net/projects/bibtexml/files/

通过 DOI 更新 Bibtex 字段

答案1

相关内容