网页抓取/爬取特定的 Google 图书

Question 1

如果我在 Google 上搜索 download "Gazetteer of the World" dictionary of geographic knowledge，会看到它有 PDF 格式可供下载，之后可以使用 PDF2Word 之类的工具来提取文本；

除非 PDF 全部都是图片 ;-) 然后您可以尝试使用 pdf2jpg 进行提取并将图像文件输入 OCR 程序。

您也可以购买它们（亚马逊有几种畅销产品），剪下页面然后将其放入具有自动进纸和 OCR 功能的扫描仪中。

由于这是一次性的努力，编程将是我的最后手段。

如果您首先对能够获取（6？）卷的格式进行盘点，并估算处理这些格式所需的成本和工作量，这可能会有所帮助。

Answer

如果我在 Google 上搜索 download "Gazetteer of the World" dictionary of geographic knowledge，会看到它有 PDF 格式可供下载，之后可以使用 PDF2Word 之类的工具来提取文本；

除非 PDF 全部都是图片 ;-) 然后您可以尝试使用 pdf2jpg 进行提取并将图像文件输入 OCR 程序。

您也可以购买它们（亚马逊有几种畅销产品），剪下页面然后将其放入具有自动进纸和 OCR 功能的扫描仪中。

由于这是一次性的努力，编程将是我的最后手段。

如果您首先对能够获取（6？）卷的格式进行盘点，并估算处理这些格式所需的成本和工作量，这可能会有所帮助。

Question 2

我在 node.js 中编写了一个小程序（名为 extract.js）来抓取文本，从而解决了这个问题。我参考了这个页面：http://blog.miguelgrinberg.com/post/easy-web-scraping-with-nodejs

每个 html 页面包含多个书页。因此，如果我们只将 url 中的页面参数增加 1，那么如果我们不小心，我们就会抓取重复的书页（这是我特别困惑的部分）。我通过使用 jquery 选择器来解决这个问题，只选择 url 中指定的单个书页，并忽略 html 中存在的其他书页。这样，我可以使用电子表格程序快速构建一个文本文件，其中按顺序排列每个单页的 url（因为增量只有 1）。

到目前为止，我已经成功抓取了前两卷，还有五卷要抓取！代码如下，它可以作为抓取其他 Google 图书的有用起点。

// Usage: node extract.js input output
// where input (mandatory) is the text file containing your list of urls
// and output (optional) is the directory where the output files will be saved

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

// Read the command line parameters
var input = process.argv[2];
var output = process.argv[3];

if (!input) {
  console.log("Missing input parameter");
  return;
}

// Read the url input file, each url is on a new line
var urls = fs.readFileSync(input).toString().split('\n');

// Check for non urls and remove
for (var i = 0; i < urls.length; i++) {
  if (urls[i].slice(0, 4) != 'http') {
    urls.splice(i, 1);
  }
}

// Iterate through the urls
for (var i = 0; i < urls.length; i++) {
  var url = urls[i];

  // request function is asynchronous, hence requirement for self-executing function
  // Cannot guarantee the execution order of the callback for each url, therefore save results to separate files 
  request(url, ( function(url) {
            return function(err, resp, body) {
                if (err)
                    throw err;

                // Extract the pg parameter (book page) from the url
                // We will use this to only extract the text from this book page
                // because a retrieved html page contains multiple book pages
                var pg = url.slice(url.indexOf('pg=') + 3, url.indexOf('&output=text'));

                //
                // Define the filename
                //
                var number = pg.slice(2, pg.length);
                var zeroes = 4 - number.length;

                // Insert leading zeroes
                for (var j = 0; j < zeroes; j++) {
                  number = '0' + number;
                }  

                var filename = pg.slice(0, 2) + number + '.txt';

                // Add path to filename
                if (output) {
                  if (!fs.existsSync(output))
                    fs.mkdirSync(output);

                  filename = output + '/' + filename;
                }

                // Delete the file if it already exists
                if (fs.existsSync(filename))
                  fs.unlinkSync(filename);

                // Make the DOM available to jquery
                $ = cheerio.load(body);

                // Select the book page
                // Pages are contained within 'div' elements (where class='flow'),
                // each of which contains an 'a' element where id is equal to the page
                // Use ^ to match pages because sometimes page ids can have a trailing hyphen and extra characters
            var page = $('div.flow:has(a[id=' + pg + ']), div.flow:has(a[id^=' + pg + '-])');

            //
            // Extract and save the text of the book page to the file
            //

            var hasText = false;

            // Text is in 'gtxt_body', 'gtxt_column' and 'gtxt_footnote'
            page.find('div.gtxt_body, div.gtxt_column, div.gtxt_footnote').each(function() {  
              this.find('p.gtxt_body, p.gtxt_column, p.gtxt_footnote').each(function() {
                hasText = true;

                fs.appendFileSync(filename, this.text());
                fs.appendFileSync(filename, '\n\n');
              });
            });

                // Log progress
                if (hasText) {
                  console.log("Retrieved and saved page: " + pg);
                }
                else {
                  console.log("Skipping page: " + pg);
                }
            }
        } )(url));
}

Answer

我在 node.js 中编写了一个小程序（名为 extract.js）来抓取文本，从而解决了这个问题。我参考了这个页面：http://blog.miguelgrinberg.com/post/easy-web-scraping-with-nodejs

每个 html 页面包含多个书页。因此，如果我们只将 url 中的页面参数增加 1，那么如果我们不小心，我们就会抓取重复的书页（这是我特别困惑的部分）。我通过使用 jquery 选择器来解决这个问题，只选择 url 中指定的单个书页，并忽略 html 中存在的其他书页。这样，我可以使用电子表格程序快速构建一个文本文件，其中按顺序排列每个单页的 url（因为增量只有 1）。

到目前为止，我已经成功抓取了前两卷，还有五卷要抓取！代码如下，它可以作为抓取其他 Google 图书的有用起点。

// Usage: node extract.js input output
// where input (mandatory) is the text file containing your list of urls
// and output (optional) is the directory where the output files will be saved

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

// Read the command line parameters
var input = process.argv[2];
var output = process.argv[3];

if (!input) {
  console.log("Missing input parameter");
  return;
}

// Read the url input file, each url is on a new line
var urls = fs.readFileSync(input).toString().split('\n');

// Check for non urls and remove
for (var i = 0; i < urls.length; i++) {
  if (urls[i].slice(0, 4) != 'http') {
    urls.splice(i, 1);
  }
}

// Iterate through the urls
for (var i = 0; i < urls.length; i++) {
  var url = urls[i];

  // request function is asynchronous, hence requirement for self-executing function
  // Cannot guarantee the execution order of the callback for each url, therefore save results to separate files 
  request(url, ( function(url) {
            return function(err, resp, body) {
                if (err)
                    throw err;

                // Extract the pg parameter (book page) from the url
                // We will use this to only extract the text from this book page
                // because a retrieved html page contains multiple book pages
                var pg = url.slice(url.indexOf('pg=') + 3, url.indexOf('&output=text'));

                //
                // Define the filename
                //
                var number = pg.slice(2, pg.length);
                var zeroes = 4 - number.length;

                // Insert leading zeroes
                for (var j = 0; j < zeroes; j++) {
                  number = '0' + number;
                }  

                var filename = pg.slice(0, 2) + number + '.txt';

                // Add path to filename
                if (output) {
                  if (!fs.existsSync(output))
                    fs.mkdirSync(output);

                  filename = output + '/' + filename;
                }

                // Delete the file if it already exists
                if (fs.existsSync(filename))
                  fs.unlinkSync(filename);

                // Make the DOM available to jquery
                $ = cheerio.load(body);

                // Select the book page
                // Pages are contained within 'div' elements (where class='flow'),
                // each of which contains an 'a' element where id is equal to the page
                // Use ^ to match pages because sometimes page ids can have a trailing hyphen and extra characters
            var page = $('div.flow:has(a[id=' + pg + ']), div.flow:has(a[id^=' + pg + '-])');

            //
            // Extract and save the text of the book page to the file
            //

            var hasText = false;

            // Text is in 'gtxt_body', 'gtxt_column' and 'gtxt_footnote'
            page.find('div.gtxt_body, div.gtxt_column, div.gtxt_footnote').each(function() {  
              this.find('p.gtxt_body, p.gtxt_column, p.gtxt_footnote').each(function() {
                hasText = true;

                fs.appendFileSync(filename, this.text());
                fs.appendFileSync(filename, '\n\n');
              });
            });

                // Log progress
                if (hasText) {
                  console.log("Retrieved and saved page: " + pg);
                }
                else {
                  console.log("Skipping page: " + pg);
                }
            }
        } )(url));
}

网页抓取/爬取特定的 Google 图书

答案1

答案2

相关内容