将较大的 .bib 文件拆分为每个条目一个 .bib 文件

Question 1

如果您使用 R，可以把 .bib 文件中的数据直接解析为原生 R 对象，这样就无需为每个 bib 条目单独创建文件。我创建了一个 Gist 来展示相应的 R 代码： https://gist.github.com/jmclawson/def66ac8635db9c6131c3f3ae092f6e5

以下是结果数据框的屏幕截图：

Answer

如果您使用 R，可以把 .bib 文件中的数据直接解析为原生 R 对象，这样就无需为每个 bib 条目单独创建文件。我创建了一个 Gist 来展示相应的 R 代码： https://gist.github.com/jmclawson/def66ac8635db9c6131c3f3ae092f6e5

以下是结果数据框的屏幕截图：

Question 2

好吧，我自己找到了答案，答案如下：

我使用 bibtool 和 bib2bib（来自 bibtex2html）完成了拆分；另外还附上了一段 R 代码，用于把所有条目合并成一个 data.frame。

命令中的 -- 'print.xxx = {...}' 参数用于设置 bibtool 非常有用的格式化功能（参见手册第 27 页），它能以完全标准化的方式格式化 .bib 文件，确保一切正常。不幸的是，默认设置会在 77 个字符处用 \n 换行，并使用制表符对齐，这不利于在 R 中解析（例如较长的摘要、标题等），因此下面的命令关闭了制表符，并把行宽设得非常大。

# First write the list of all citation keys for the loop.
bib2bib -oc all-entries-list.bib  --no-comment library.bib

# Then extract every entry into its own file. Reading the list line by line
# avoids the word splitting and glob expansion that `for x in $(cat file)`
# applies to its input.
while IFS= read -r xkey; do
  # skip blank lines in the key list
  [ -n "$xkey" ] || continue
  # print.use.tab = {0} and a huge print.line.length keep each field on one
  # space-aligned line, which the R code below relies on.
  # NOTE(review): the key is used as a regular expression; keys containing
  # regex metacharacters may select more than one entry.
  bibtool '--select{"^'"$xkey"'$"}' -i library.bib -o path/to/dir/"$xkey.bib" -- 'print.use.tab = {0}' -- 'print.line.length = {999999999}' < /dev/null
done < all-entries-list.bib

这是 R 中的代码

# wd and libs
# NOTE(review): setwd() ties the script to one machine; the relative path
# below is resolved against this directory.
setwd('some/path/to/bibtex-entries')
library(fs)
library(data.table)
library(purrr)

# get all single-entry files; the argument name `regexp` is spelled out in
# full (the original `regex` only worked through partial matching) and the
# pattern is anchored so that only names ending in ".bib" are returned
fil1 <- dir_ls(
  "input/singles-entries/work-bibliotek-entries/",
  regexp = "\\.bib$"
)

# Parse one single-entry .bib file into a one-row data.table.
#
# .x: path to a .bib file holding one entry, with every field on a single
#     line in the form `name = {value},` (the layout produced by the bibtool
#     call above).
# Returns a one-row data.table with one column per field, named by the
# whitespace-trimmed field name, plus an "entry-type" column taken from the
# "@type{key," header line. Values are whitespace-trimmed as well.
# Signals an error when the file contains no header or field lines.
clean_entries <- function(.x) {
  # keep only the "@type{key," header and the "name = {value}" field lines
  lines <- grep("@|=", readLines(.x), value = TRUE)
  if (length(lines) == 0L) {
    stop("no BibTeX entry or field lines found in ", .x, call. = FALSE)
  }

  # split every line at the FIRST "= {" only, so that a value which itself
  # contains "= {" cannot yield more than two pieces
  split_at <- as.integer(regexpr("=(?= \\{)", lines, perl = TRUE))
  has_value <- split_at > 0L
  field <- ifelse(has_value, substr(lines, 1L, split_at - 1L), lines)
  value <- ifelse(
    has_value,
    substr(lines, split_at + 1L, nchar(lines)),
    NA_character_
  )

  # the header line carries the entry type instead of a field
  is_header <- grepl("@", field, fixed = TRUE)
  value[is_header] <- gsub("^@([A-Za-z]+).*$", "\\1", lines[is_header])
  field[is_header] <- "entry-type"

  # remove the opening "{" and the closing "}" plus any trailing commas
  value <- gsub("(^[ ])\\{(.*)", "\\1\\2", value)
  value <- gsub("(.+)\\},*$", "\\1", value)

  # one row, one column per field; trimming drops the alignment padding that
  # bibtool puts around names and the space left in front of values
  entry <- data.table::transpose(data.table::data.table(trimws(value)))
  data.table::setnames(entry, trimws(field))
  entry
}

# use safely() to run through all of them and capture the result or the
# error of each file without stopping
bib1 <- map(fil1, safely(clean_entries))
# show the files that failed, together with their errors
map(bib1, "error") %>% keep(~ !is.null(.x))

# rbind the list, with the name of each list element (the file path) stored
# in a column called 'citekey'
bib2 <- rbindlist(map(bib1, "result"), idcol = "citekey", fill = TRUE)
# reduce the path to the bare citation key: drop the directory part and the
# literal ".bib" extension (this also works for paths without a "/")
bib2[, citekey := sub("\\.bib$", "", basename(citekey))]

Answer

好吧，我自己找到了答案，答案如下：

我使用 bibtool 和 bib2bib（来自 bibtex2html）完成了拆分；另外还附上了一段 R 代码，用于把所有条目合并成一个 data.frame。

命令中的 -- 'print.xxx = {...}' 参数用于设置 bibtool 非常有用的格式化功能（参见手册第 27 页），它能以完全标准化的方式格式化 .bib 文件，确保一切正常。不幸的是，默认设置会在 77 个字符处用 \n 换行，并使用制表符对齐，这不利于在 R 中解析（例如较长的摘要、标题等），因此下面的命令关闭了制表符，并把行宽设得非常大。

# First write the list of all citation keys for the loop.
bib2bib -oc all-entries-list.bib  --no-comment library.bib

# Then extract every entry into its own file. Reading the list line by line
# avoids the word splitting and glob expansion that `for x in $(cat file)`
# applies to its input.
while IFS= read -r xkey; do
  # skip blank lines in the key list
  [ -n "$xkey" ] || continue
  # print.use.tab = {0} and a huge print.line.length keep each field on one
  # space-aligned line, which the R code below relies on.
  # NOTE(review): the key is used as a regular expression; keys containing
  # regex metacharacters may select more than one entry.
  bibtool '--select{"^'"$xkey"'$"}' -i library.bib -o path/to/dir/"$xkey.bib" -- 'print.use.tab = {0}' -- 'print.line.length = {999999999}' < /dev/null
done < all-entries-list.bib

这是 R 中的代码

# wd and libs
# NOTE(review): setwd() ties the script to one machine; the relative path
# below is resolved against this directory.
setwd('some/path/to/bibtex-entries')
library(fs)
library(data.table)
library(purrr)

# get all single-entry files; the argument name `regexp` is spelled out in
# full (the original `regex` only worked through partial matching) and the
# pattern is anchored so that only names ending in ".bib" are returned
fil1 <- dir_ls(
  "input/singles-entries/work-bibliotek-entries/",
  regexp = "\\.bib$"
)

# Parse one single-entry .bib file into a one-row data.table.
#
# .x: path to a .bib file holding one entry, with every field on a single
#     line in the form `name = {value},` (the layout produced by the bibtool
#     call above).
# Returns a one-row data.table with one column per field, named by the
# whitespace-trimmed field name, plus an "entry-type" column taken from the
# "@type{key," header line. Values are whitespace-trimmed as well.
# Signals an error when the file contains no header or field lines.
clean_entries <- function(.x) {
  # keep only the "@type{key," header and the "name = {value}" field lines
  lines <- grep("@|=", readLines(.x), value = TRUE)
  if (length(lines) == 0L) {
    stop("no BibTeX entry or field lines found in ", .x, call. = FALSE)
  }

  # split every line at the FIRST "= {" only, so that a value which itself
  # contains "= {" cannot yield more than two pieces
  split_at <- as.integer(regexpr("=(?= \\{)", lines, perl = TRUE))
  has_value <- split_at > 0L
  field <- ifelse(has_value, substr(lines, 1L, split_at - 1L), lines)
  value <- ifelse(
    has_value,
    substr(lines, split_at + 1L, nchar(lines)),
    NA_character_
  )

  # the header line carries the entry type instead of a field
  is_header <- grepl("@", field, fixed = TRUE)
  value[is_header] <- gsub("^@([A-Za-z]+).*$", "\\1", lines[is_header])
  field[is_header] <- "entry-type"

  # remove the opening "{" and the closing "}" plus any trailing commas
  value <- gsub("(^[ ])\\{(.*)", "\\1\\2", value)
  value <- gsub("(.+)\\},*$", "\\1", value)

  # one row, one column per field; trimming drops the alignment padding that
  # bibtool puts around names and the space left in front of values
  entry <- data.table::transpose(data.table::data.table(trimws(value)))
  data.table::setnames(entry, trimws(field))
  entry
}

# use safely() to run through all of them and capture the result or the
# error of each file without stopping
bib1 <- map(fil1, safely(clean_entries))
# show the files that failed, together with their errors
map(bib1, "error") %>% keep(~ !is.null(.x))

# rbind the list, with the name of each list element (the file path) stored
# in a column called 'citekey'
bib2 <- rbindlist(map(bib1, "result"), idcol = "citekey", fill = TRUE)
# reduce the path to the bare citation key: drop the directory part and the
# literal ".bib" extension (this also works for paths without a "/")
bib2[, citekey := sub("\\.bib$", "", basename(citekey))]

将较大的 .bib 文件拆分为每个条目一个 .bib 文件

答案1

答案2

相关内容