是否有一个工具可以读取 .bib 文件,并为所有没有 DOI 字段的条目添加正确的 DOI 字段?
我当前的工作流程是在需要时使用 emacs 中的 auctex 添加引文。因此,使用命令行工具就足够了。我宁愿不将文件加载到某些书目管理器(如 jabref)中,因为它会添加所有这些多余的字段(如“所有者”和“时间戳”),而这些字段毫无用处。我想,有了文件中所有的书目信息,通过一些明智的数据库搜索应该很容易识别正确的 DOI...
答案1
我按照 user13348 的建议,使用他的请求函数编写了一个 python3 脚本,该脚本接受一个 bib 文件并输出一个包含其找到的 DOI 的新 bib 文件。我没有使用 bibtool 或接受任何辅助文件。
该脚本依赖 bibtexparser 和 unidecode 这两个 Python 包。
#!/usr/bin/env python
import sys, re
from unidecode import unidecode
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
import http.client as httplib
import urllib
# Search for the DOI given a title; e.g. "computation in Noisy Radio Networks"
# Credit to user13348, slight modifications
# http://tex.stackexchange.com/questions/6810/automatically-adding-doi-fields-to-a-hand-made-bibliography
def searchdoi(title, author):
    """Search CrossRef's guest-query form for a DOI by title and author.

    POSTs an author/title search to ``/guestquery/`` on www.crossref.org and
    scans the returned page for the first ``doi.org/...`` link.

    Args:
        title: Title of the work to look up.
        author: Author name to search for.

    Returns:
        The ``re.Match`` of the first DOI link in the response (group 1 holds
        the DOI), or ``None`` when the page contains no such link.
    """
    # Imported locally: a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule has been loaded.
    import urllib.parse

    params = urllib.parse.urlencode({
        "titlesearch": "titlesearch",
        "auth2": author,
        "atitle2": title,
        "multi_hit": "on",
        "article_title_search": "Search",
        "queryType": "author-title",
    })
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "text/html",
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "www.crossref.org",
    }
    # Plain HTTP on port 80 is not working any more; HTTPS is required.
    conn = httplib.HTTPSConnection("www.crossref.org")
    try:
        conn.request("POST", "/guestquery/", params, headers)
        data = conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
    # Decode the body rather than calling str() on bytes, which would search
    # the "b'...'" repr (with escaped newlines and quotes) instead of the text.
    page = data.decode("utf-8", errors="replace")
    return re.search(r'doi\.org/([^"^<^>]+)', page)
def normalize(string):
    """Normalize a BibTeX field value to ASCII, without LaTeX markup.

    Math spans delimited by unescaped ``$`` are dropped, then braces,
    backslashes, quotes and carets are removed, and the result is
    transliterated to ASCII with ``unidecode``.

    Args:
        string: Raw field text.

    Returns:
        The cleaned-up ASCII string.
    """
    # Remove math BEFORE stripping backslashes: otherwise an escaped dollar
    # sign ("\$") loses its backslash and is mistaken for a math delimiter,
    # deleting ordinary text between two prices. DOTALL lets a math span
    # cross a line break inside a wrapped field.
    string = re.sub(r"(?<!\\)\$.*?(?<!\\)\$", "", string, flags=re.DOTALL)
    string = re.sub(r'[{}\\\'"^]', "", string)
    return unidecode(string)
def get_authors(entry):
    """Get a list of authors' (or, failing that, editors') last names.

    Args:
        entry: Mapping of BibTeX field names to string values.

    Returns:
        A list with one last name per person: the part before the comma for
        "Last, First" names, otherwise the last whitespace-separated word.

    Raises:
        KeyError: If the entry has neither an "author" nor an "editor" field.
    """
    try:
        people = entry["author"]
    except KeyError:
        people = entry["editor"]

    last_names = []
    # Names are separated by the whole word "and". Splitting on the bare
    # substring would cut surnames such as "Sandberg" or "Alexander" in two.
    for person in re.split(r"\s+and\s+", normalize(people)):
        person = person.strip()
        if not person:
            continue
        if "," in person:
            last_names.append(person.split(",")[0].strip())
        else:
            # A single-word name is its own last word.
            last_names.append(person.split()[-1])
    return last_names
# Script body: read the .bib file named on the command line, look up a DOI
# for every entry that lacks one, and write the result to "<input>_doi.bib".
print("Reading Bibliography...")
with open(sys.argv[1]) as bibtex_file:
    bibliography = bibtexparser.load(bibtex_file)

print("Looking for Dois...")
before = 0
new = 0
total = len(bibliography.entries)
for i, entry in enumerate(bibliography.entries):
    print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="")
    # An absent, empty or whitespace-only field all mean "no DOI yet".
    if "doi" in entry and entry["doi"].strip():
        before += 1
        continue
    try:
        title = entry["title"]
        for author in get_authors(entry):
            doi_match = searchdoi(title, author)
            if doi_match:
                entry["doi"] = doi_match.group(1)
                new += 1
                # Stop at the first hit: continuing would count the same
                # entry once per matching author and send needless queries.
                break
    except KeyError:
        # No title, or neither author nor editor: nothing to search with.
        continue
    except Exception as err:
        # Best effort: one failed lookup must not abort the whole run, but
        # it should not vanish silently either.
        print("\nLookup failed for entry {i}: {err}".format(i=i, err=err),
              file=sys.stderr)

print("")
template="We added {new} DOIs !\nBefore: {before}/{total} entries had DOI\nNow: {after}/{total} entries have DOI"
print(template.format(new=new,before=before,after=before+new,total=total))

outfile = sys.argv[1]+"_doi.bib"
print("Writing result to ",outfile)
writer = BibTexWriter()
writer.indent = '    '  # indent entries with 4 spaces instead of one
with open(outfile, 'w') as bibfile:
    bibfile.write(writer.write(bibliography))
你可以这样使用它:
python3 searchdoi.py test.bib
它看起来会像这样:
Reading Bibliography...
Looking for Dois...
161/162 entries processed, please wait...
We added 49 DOIs !
Before: 42/162 entries had DOI
Now: 91/162 entries have DOI
Writing result to test.bib_doi.bib
您现在可以检查 test.bib_doi.bib。
答案2
我编写了一个简短的程序,它接收 BIB+AUX 文件并输出一个包含所有 DOI 的 HTML 文件,这些 DOI 是通过查询 CrossRef 获得的;请参阅下面的代码。如果能修改该程序,将 DOI 写回到原始 BIB 文件,那就太好了。如果有人愿意帮忙,请告诉我!
该程序使用 bibtool 以及 Python 包 zs.bibtex.parser。安装完成后,使用以下文件(忽略 bibtool 关于递归限制的警告):
1)“finddoi.sh”
# finddoi.sh -- usage: sh finddoi.sh <basename>   (reads <basename>.aux)
# Step 1: let bibtool write the entries selected by the .aux file to
# temp.bib, applying the resource file formatting.txt.
bibtool -x "$1.aux" -o temp.bib -r formatting.txt
# Step 2: query CrossRef for each entry; show the HTML report on screen and
# keep a copy in doi_output.html.
python finddoi.py temp.bib | tee doi_output.html
2)“formatting.txt”(bibtool 使用)
% BibTool resource file used by finddoi.sh (passed with -r).
% One setting per line.
key.number.separator = ""
fmt.et.al = ""
key.format = {short}
expand.macros = ON
delete.field {month}
print.line.length = 1000
print.braces = OFF
fmt.name.name = ""
new.format.type = { 17="%l " }
rewrite.rule { author # ".*" # "\"%100.17p(author)\"" }
3)“finddoi.py”
#!/usr/bin/env python
"""Print an HTML report of CrossRef DOI candidates for each entry of a .bib file.

Usage: python finddoi.py temp.bib

For every entry the script prints a heading with the cleaned-up title and
author, followed by the result table cut out of CrossRef's guest-query page.
The code is written so that it parses under both Python 2 and Python 3.
"""
import re
import sys

try:  # Python 3
    import http.client as httplib
    from html import escape
    from urllib.parse import urlencode
except ImportError:  # Python 2
    import httplib
    from cgi import escape
    from urllib import urlencode

from zs.bibtex.parser import parse_string


# Search for the DOI given a title; e.g. "computation in Noisy Radio Networks"
def searchdoi(title, author):
    """Return the HTML page CrossRef's guest query yields for title/author.

    Args:
        title: Title to search for.
        author: Author name to search for.

    Returns:
        The response body as text.
    """
    params = urlencode({
        "titlesearch": "titlesearch",
        "auth2": author,
        "atitle2": title,
        "multi_hit": "on",
        "article_title_search": "Search",
        "queryType": "author-title",
    })
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "text/html",
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "www.crossref.org",
    }
    # Plain HTTP on port 80 no longer works for this endpoint; use HTTPS.
    conn = httplib.HTTPSConnection("www.crossref.org")
    try:
        conn.request("POST", "/guestquery/", params, headers)
        data = conn.getresponse().read()
    finally:
        # Close the connection even if the request fails.
        conn.close()
    # Python 3 hands back bytes; the regex below needs text.
    if not isinstance(data, str):
        data = data.decode("utf-8", "replace")
    return data


# Main body
with open(sys.argv[1], 'r') as f:
    inputdata = f.read()

# remove any leftover commas otherwise Bibtex parser crashed
inputdata = re.sub(r",(\s*})", r"\1", inputdata)

try:
    bibliography = parse_string(inputdata)
except Exception:
    err = sys.exc_info()[1]
    print("Unexpected parsing error: %s" % (err,))
    sys.exit(1)

for paper in bibliography:
    try:
        title = bibliography[paper]['title']
        author = bibliography[paper]['author']
        if isinstance(author, list):
            # Only the first listed author is used for the query.
            author = author[0]
        author = str(author)
        # remove any of the characters that might confuse CrossRef
        author = re.sub(r"[{}'\\]", "", author)
        title = re.sub(r"[{}]", "", title)
        title = re.sub(r"\$.*?\$", "", title)  # better remove all math expressions
        title = re.sub(r"[^a-zA-Z0-9 ]", " ", title)
        # Escape the values: they come from the .bib file and are written
        # straight into the HTML report.
        print("<h1>DOIs for:<br>Title: %s<br>Author: %s<br> </h1>"
              % (escape(title), escape(author)))
        out = searchdoi(title, author)
        result = re.findall(
            r"\<table cellspacing=1 cellpadding=1 width=600 border=0\>.*?\<\/table\>",
            out, re.DOTALL)
        if result:
            print(result[0])
        else:
            print("Bad response from server<br><br>")
    except Exception:
        # Best effort: report the entry and carry on with the next one.
        print("Error with:  %s" % (bibliography[paper],))
答案3
基于 XaraB 的工作,这里有一个使用 crossref API 直接获取 JSON 对象的工作实现。它还具有一些基本的错误处理和一些重试逻辑,以解决 crossref API 中当前的错误,该错误会导致查询有时失败。
#!/usr/bin/env python
import sys
import re
from unidecode import unidecode
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
import requests
import urllib
import time
# Search for the DOI given a title; e.g. "computation in Noisy Radio Networks"
# Credit to user13348, slight modifications
# http://tex.stackexchange.com/questions/6810/automatically-adding-doi-fields-to-a-hand-made-bibliography
#
class DOIError(Exception):
    """Raised when no DOI could be obtained for a title/author query."""
def searchdoi(title, author, tries=4):
    """Look up the DOI of a work through the CrossRef ``/works`` API.

    The request is repeated when it fails, because the API sometimes
    returns an error for a query that succeeds on retry.

    Args:
        title: Title of the work (sent as ``query.title``).
        author: Author string (sent as ``query.author``).
        tries: Maximum number of requests to send before giving up.

    Returns:
        The ``DOI`` value of the first item in the JSON response.

    Raises:
        DOIError: If the response has no usable first item, or if no request
            out of ``tries`` succeeded.
    """
    # Imported locally: a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule has been loaded.
    import urllib.parse

    params = urllib.parse.urlencode(
        {"query.author": author, "query.title": title})
    url = "https://api.crossref.org/works?" + params

    for attempt in range(1, tries + 1):
        try:
            # The timeout keeps a stalled connection from hanging the run.
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            # Treat a network failure like a bad status: retry.
            response = None

        if response is not None and response.ok:
            try:
                return response.json()['message']['items'][0]['DOI']
            except (ValueError, LookupError, TypeError) as err:
                # Body is not JSON, or it has no first item carrying a DOI.
                print("something wrong with json response for " + params)
                raise DOIError("no DOI in response for " + params) from err

        if attempt < tries:
            print("Response not 200 OK. Retrying, try " + str(attempt)
                  + " of " + str(tries))
            time.sleep(1)

    # Reached only when every attempt failed, so a success on the final
    # attempt is returned above instead of being discarded.
    raise DOIError("Tried " + str(tries) + " times. Response"
                   " still not 200 OK! Uh oh...")
def normalize(string):
    """Normalize a BibTeX field value to ASCII, without LaTeX markup.

    Math spans delimited by unescaped ``$`` are dropped, then braces,
    backslashes, quotes and carets are removed, and the result is
    transliterated to ASCII with ``unidecode``.

    Args:
        string: Raw field text.

    Returns:
        The cleaned-up ASCII string.
    """
    # Remove math BEFORE stripping backslashes: otherwise an escaped dollar
    # sign ("\$") loses its backslash and is mistaken for a math delimiter,
    # deleting ordinary text between two prices. DOTALL lets a math span
    # cross a line break inside a wrapped field.
    string = re.sub(r"(?<!\\)\$.*?(?<!\\)\$", "", string, flags=re.DOTALL)
    string = re.sub(r'[{}\\\'"^]', "", string)
    return unidecode(string)
def get_authors(entry):
    """Get a list of authors' (or, failing that, editors') last names.

    Args:
        entry: Mapping of BibTeX field names to string values.

    Returns:
        A list with one last name per person: the part before the comma for
        "Last, First" names, otherwise the last whitespace-separated word.

    Raises:
        KeyError: If the entry has neither an "author" nor an "editor" field.
    """
    try:
        people = entry["author"]
    except KeyError:
        people = entry["editor"]

    last_names = []
    # Names are separated by the whole word "and". Splitting on the bare
    # substring would cut surnames such as "Sandberg" or "Alexander" in two.
    for person in re.split(r"\s+and\s+", normalize(people)):
        person = person.strip()
        if not person:
            continue
        if "," in person:
            last_names.append(person.split(",")[0].strip())
        else:
            # A single-word name is its own last word.
            last_names.append(person.split()[-1])
    return last_names
def main(bibtex_filename):
    """Add DOIs to the entries of a BibTeX file that lack one.

    Reads ``bibtex_filename``, calls ``searchdoi`` for every entry without a
    DOI, prints a summary, and writes the updated bibliography to
    ``bibtex_filename + "_doi.bib"``. The input file is not modified.

    Args:
        bibtex_filename: Path of the .bib file to read.
    """
    print("Reading Bibliography...")
    with open(bibtex_filename) as bibtex_file:
        bibliography = bibtexparser.load(bibtex_file)

    print("Looking for Dois...")
    before = 0
    new = 0
    total = len(bibliography.entries)
    for i, entry in enumerate(bibliography.entries):
        print("\r{i}/{total} entries processed, please wait...".format(i=i,
              total=total), flush=True, end="")
        # An absent, empty or whitespace-only field all mean "no DOI yet".
        if "doi" in entry and entry["doi"].strip():
            before += 1
            continue

        # Keep the KeyError handler tight around the two field lookups so it
        # only ever reports what its message says.
        try:
            title = entry["title"]
            authors = entry["author"]
        except KeyError:
            print("some issue with this entry! No title or no author")
            continue

        try:
            entry["doi"] = searchdoi(title, authors)
        except DOIError:
            print("unable to find DOI for " + title)
        else:
            new += 1

    print("")
    template = "We added {new} DOIs !\nBefore: {before}/{total} entries had DOI\nNow: {after}/{total} entries have DOI"
    print(
        template.format(
            new=new,
            before=before,
            after=before+new,
            total=total))

    outfile = bibtex_filename + "_doi.bib"
    print("Writing result to ", outfile)
    writer = BibTexWriter()
    writer.indent = '    '  # indent entries with 4 spaces instead of one
    with open(outfile, 'w') as bibfile:
        bibfile.write(writer.write(bibliography))
if __name__ == '__main__':
main(sys.argv