如何从cups-pdf生成的PDF中复制/粘贴文本？

Question 1

我确认了完全相同的行为，这是长期存在的“错误”[1] [2]，而“cups-pdf”的作者并不认为这是一个错误，因此现有的补丁不会在上游使用。

对于 Ubuntu，使用提到的 PPA 应该可以工作，但我目前无法测试它。

如果有人使用原版 Slackware，则需要更多技能。对于 Slackware 14.2 和 CUPS-PDF 3.0.1 [4]，[7] 中的补丁 [5][6] 无法使用，因为它太旧了。经过反复的盲目试错，我尝试准备了一个可行的补丁 [8]。

请注意，您应该手动安装它，然后非常小心地逐个文件安装新的“cups-pdf”；我始终没能让 SlackBuild 和 install.sh 正常工作，我的水平有限。但事实上，问题终于消失了。

您应该删除现有的 PDF 打印机，并使用手动提供的 CUPS-PDF_opt.ppd 添加一台打印机，也可以选择（以确保）删除 CUPS-PDF.ppd。

[1]https://www.linuxquestions.org/questions/linux-software-2/cannot-copy-text-from-pdfs-created-with-cups-pdf-4175440557/

[2]https://github.com/alexivkin/CUPS-PDF-to-PDF

[3]https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=658004

[4]https://slackbuilds.org/repository/14.2/office/cups-pdf/

[5]https://bugs.launchpad.net/ubuntu/+source/cups-pdf/+bug/820820/+attachment/3878188/+files/04_add_pdf_passthrough_support.patch

[6]https://launchpadlibrarian.net/153781216/04_add_pdf_passthrough_support.patch

[7]https://bugs.launchpad.net/ubuntu/+source/cups-pdf/+bug/820820

[8]

--- a/extra/CUPS-PDF_noopt.ppd
+++ b/extra/CUPS-PDF_noopt.ppd
@@ -31,7 +31,8 @@
*ModelName:     "Generic CUPS-PDF Printer"
*ShortNickName: "Generic CUPS-PDF Printer"
*NickName:      "Generic CUPS-PDF Printer (no options)"
-*1284DeviceID:  "MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:POSTSCRIPT;"
+*1284DeviceID:  "MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:PDF,POSTSCRIPT;"
+*cupsFilter:    "application/pdf 0 -"
*% cupsFilter:    "application/vnd.cups-postscript 0 pstitleiconv"
*PSVersion: "(2017.000) 0"
*LanguageLevel: "2"
--- a/extra/CUPS-PDF_opt.ppd
+++ b/extra/CUPS-PDF_opt.ppd
@@ -31,7 +31,8 @@
*ModelName:     "Generic CUPS-PDF Printer"
*ShortNickName: "Generic CUPS-PDF Printer"
*NickName:      "Generic CUPS-PDF Printer (w/ options)"
-*1284DeviceID:  "MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:POSTSCRIPT;"
+*1284DeviceID:  "MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:PDF,POSTSCRIPT;"
+*cupsFilter:    "application/pdf 0 -"
*% cupsFilter:    "application/vnd.cups-postscript 0 pstitleiconv"
*PSVersion: "(2017.000) 0"
*LanguageLevel: "2"
--- a/src/cups-pdf.c
+++ b/src/cups-pdf.c
@@ -60,6 +60,7 @@
extern int errno;

static FILE *logfp=NULL;
+int input_is_pdf=0;

static void log_event(short type, char message[], char *detail) {
    time_t secs;
@@ -427,7 +427,7 @@
    int len;
    cp_string setup;

-  printf("file cups-pdf:/ \"Virtual PDF Printer\" \"CUPS-PDF\" \"MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:POSTSCRIPT;\"\n");
+  printf("file cups-pdf:/ \"Virtual PDF Printer\" \"CUPS-PDF\" \"MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:PDF,POSTSCRIPT;\"\n");

    if ((dir = opendir(CP_CONFIG_PATH)) != NULL) {
        while ((config_ent = readdir(dir)) != NULL) {
@@ -460,6 +461,7 @@
    cp_string buffer;
    int rec_depth,is_title=0;
    FILE *fpdest;
+  size_t bytes = 0; 

    if (fpsrc == NULL) {
        log_event(CPERROR, "failed to open source stream", NULL);
@@ -483,14 +485,39 @@
        log_event(CPSTATUS, "***Experimental Option: FixNewlines");
    else
        log_event(CPDEBUG, "using traditional fgets");
-  while (fgets2(buffer, BUFSIZE, fpsrc) != NULL) {
+  while ((bytes = fread(buffer, sizeof(char), 4, fpsrc)) > 0) {
+    if (!strncmp(buffer, "%PDF", 4)) {
+      log_event(CPDEBUG, "found beginning of PDF code", buffer);
+      input_is_pdf=1;
+      break;
+    }
        if (!strncmp(buffer, "%!", 2) && strncmp(buffer, "%!PS-AdobeFont", 14)) {
            log_event(CPDEBUG, "found beginning of postscript code: %s", buffer);
            break;
        }
    }
-  log_event(CPDEBUG, "now extracting postscript code");
-  (void) fputs(buffer, fpdest);
+  log_event(CPDEBUG, "now extracting code");
+  fwrite(buffer, sizeof(char), bytes, fpdest);
+  if (input_is_pdf) {
+    while((bytes = fread(buffer, sizeof(char), BUFSIZE, fpsrc)) > 0)
+      fwrite(buffer, sizeof(char), bytes, fpdest);
+/* Commented out because decoding of utf16 PDF strings isn't implemented.
+    rewind(fpsrc);
+    while (fgets2(buffer, BUFSIZE, fpsrc) != NULL) {
+      if (!is_title) {
+        char *begin = strstr(buffer, "/Title");
+        if (begin) {
+          char *end = strstr(begin, ">");
+          if (end) {
+            strncpy(title, begin+6, BUFSIZE);
+            log_event(CPDEBUG, "found title in PDF code", title);
+            is_title=1;
+          }
+        }
+      }
+    }*/
+  }
+  else {
    while (fgets2(buffer, BUFSIZE, fpsrc) != NULL) {
        (void) fputs(buffer, fpdest);
        if (!is_title && !rec_depth)
@@ -513,6 +540,7 @@
            }
        }
    }
+  }
    (void) fclose(fpdest);
    (void) fclose(fpsrc);
    log_event(CPDEBUG, "all data written to spoolfile: %s", spoolfile);
@@ -761,7 +789,12 @@
            (void) fclose(logfp);
        return 5;
    }
-  snprintf(gscall, size, Conf_GSCall, Conf_GhostScript, Conf_PDFVer, outfile, spoolfile);
+  if (input_is_pdf) {
+    snprintf(gscall, size, "cp %s %s", spoolfile, outfile);
+  }
+  else {
+    snprintf(gscall, size, Conf_GSCall, Conf_GhostScript, Conf_PDFVer, outfile, spoolfile);
+  }
    log_event(CPDEBUG, "ghostscript commandline built: %s", gscall);

    (void) unlink(outfile);

Answer

我确认了完全相同的行为，这是长期存在的“错误”[1] [2]，而“cups-pdf”的作者并不认为这是一个错误，因此现有的补丁不会在上游使用。

对于 Ubuntu，使用提到的 PPA 应该可以工作，但我目前无法测试它。

如果有人使用原版 Slackware，则需要更多技能。对于 Slackware 14.2 和 CUPS-PDF 3.0.1 [4]，[7] 中的补丁 [5][6] 无法使用，因为它太旧了。经过反复的盲目试错，我尝试准备了一个可行的补丁 [8]。

请注意，您应该手动安装它，然后非常小心地逐个文件安装新的“cups-pdf”；我始终没能让 SlackBuild 和 install.sh 正常工作，我的水平有限。但事实上，问题终于消失了。

您应该删除现有的 PDF 打印机，并使用手动提供的 CUPS-PDF_opt.ppd 添加一台打印机，也可以选择（以确保）删除 CUPS-PDF.ppd。

[1]https://www.linuxquestions.org/questions/linux-software-2/cannot-copy-text-from-pdfs-created-with-cups-pdf-4175440557/

[2]https://github.com/alexivkin/CUPS-PDF-to-PDF

[3]https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=658004

[4]https://slackbuilds.org/repository/14.2/office/cups-pdf/

[5]https://bugs.launchpad.net/ubuntu/+source/cups-pdf/+bug/820820/+attachment/3878188/+files/04_add_pdf_passthrough_support.patch

[6]https://launchpadlibrarian.net/153781216/04_add_pdf_passthrough_support.patch

[7]https://bugs.launchpad.net/ubuntu/+source/cups-pdf/+bug/820820

[8]

--- a/extra/CUPS-PDF_noopt.ppd
+++ b/extra/CUPS-PDF_noopt.ppd
@@ -31,7 +31,8 @@
*ModelName:     "Generic CUPS-PDF Printer"
*ShortNickName: "Generic CUPS-PDF Printer"
*NickName:      "Generic CUPS-PDF Printer (no options)"
-*1284DeviceID:  "MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:POSTSCRIPT;"
+*1284DeviceID:  "MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:PDF,POSTSCRIPT;"
+*cupsFilter:    "application/pdf 0 -"
*% cupsFilter:    "application/vnd.cups-postscript 0 pstitleiconv"
*PSVersion: "(2017.000) 0"
*LanguageLevel: "2"
--- a/extra/CUPS-PDF_opt.ppd
+++ b/extra/CUPS-PDF_opt.ppd
@@ -31,7 +31,8 @@
*ModelName:     "Generic CUPS-PDF Printer"
*ShortNickName: "Generic CUPS-PDF Printer"
*NickName:      "Generic CUPS-PDF Printer (w/ options)"
-*1284DeviceID:  "MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:POSTSCRIPT;"
+*1284DeviceID:  "MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:PDF,POSTSCRIPT;"
+*cupsFilter:    "application/pdf 0 -"
*% cupsFilter:    "application/vnd.cups-postscript 0 pstitleiconv"
*PSVersion: "(2017.000) 0"
*LanguageLevel: "2"
--- a/src/cups-pdf.c
+++ b/src/cups-pdf.c
@@ -60,6 +60,7 @@
extern int errno;

static FILE *logfp=NULL;
+int input_is_pdf=0;

static void log_event(short type, char message[], char *detail) {
    time_t secs;
@@ -427,7 +427,7 @@
    int len;
    cp_string setup;

-  printf("file cups-pdf:/ \"Virtual PDF Printer\" \"CUPS-PDF\" \"MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:POSTSCRIPT;\"\n");
+  printf("file cups-pdf:/ \"Virtual PDF Printer\" \"CUPS-PDF\" \"MFG:Generic;MDL:CUPS-PDF Printer;DES:Generic CUPS-PDF Printer;CLS:PRINTER;CMD:PDF,POSTSCRIPT;\"\n");

    if ((dir = opendir(CP_CONFIG_PATH)) != NULL) {
        while ((config_ent = readdir(dir)) != NULL) {
@@ -460,6 +461,7 @@
    cp_string buffer;
    int rec_depth,is_title=0;
    FILE *fpdest;
+  size_t bytes = 0; 

    if (fpsrc == NULL) {
        log_event(CPERROR, "failed to open source stream", NULL);
@@ -483,14 +485,39 @@
        log_event(CPSTATUS, "***Experimental Option: FixNewlines");
    else
        log_event(CPDEBUG, "using traditional fgets");
-  while (fgets2(buffer, BUFSIZE, fpsrc) != NULL) {
+  while ((bytes = fread(buffer, sizeof(char), 4, fpsrc)) > 0) {
+    if (!strncmp(buffer, "%PDF", 4)) {
+      log_event(CPDEBUG, "found beginning of PDF code", buffer);
+      input_is_pdf=1;
+      break;
+    }
        if (!strncmp(buffer, "%!", 2) && strncmp(buffer, "%!PS-AdobeFont", 14)) {
            log_event(CPDEBUG, "found beginning of postscript code: %s", buffer);
            break;
        }
    }
-  log_event(CPDEBUG, "now extracting postscript code");
-  (void) fputs(buffer, fpdest);
+  log_event(CPDEBUG, "now extracting code");
+  fwrite(buffer, sizeof(char), bytes, fpdest);
+  if (input_is_pdf) {
+    while((bytes = fread(buffer, sizeof(char), BUFSIZE, fpsrc)) > 0)
+      fwrite(buffer, sizeof(char), bytes, fpdest);
+/* Commented out because decoding of utf16 PDF strings isn't implemented.
+    rewind(fpsrc);
+    while (fgets2(buffer, BUFSIZE, fpsrc) != NULL) {
+      if (!is_title) {
+        char *begin = strstr(buffer, "/Title");
+        if (begin) {
+          char *end = strstr(begin, ">");
+          if (end) {
+            strncpy(title, begin+6, BUFSIZE);
+            log_event(CPDEBUG, "found title in PDF code", title);
+            is_title=1;
+          }
+        }
+      }
+    }*/
+  }
+  else {
    while (fgets2(buffer, BUFSIZE, fpsrc) != NULL) {
        (void) fputs(buffer, fpdest);
        if (!is_title && !rec_depth)
@@ -513,6 +540,7 @@
            }
        }
    }
+  }
    (void) fclose(fpdest);
    (void) fclose(fpsrc);
    log_event(CPDEBUG, "all data written to spoolfile: %s", spoolfile);
@@ -761,7 +789,12 @@
            (void) fclose(logfp);
        return 5;
    }
-  snprintf(gscall, size, Conf_GSCall, Conf_GhostScript, Conf_PDFVer, outfile, spoolfile);
+  if (input_is_pdf) {
+    snprintf(gscall, size, "cp %s %s", spoolfile, outfile);
+  }
+  else {
+    snprintf(gscall, size, Conf_GSCall, Conf_GhostScript, Conf_PDFVer, outfile, spoolfile);
+  }
    log_event(CPDEBUG, "ghostscript commandline built: %s", gscall);

    (void) unlink(outfile);

Question 2

那么你也可以尝试这个方法：不如使用 LibreOffice 中的 PDF 导出功能？之后您就可以打印、使用 cups-pdf，或者做任何您想做的事情。

Answer

那么你也可以尝试这个方法：不如使用 LibreOffice 中的 PDF 导出功能？之后您就可以打印、使用 cups-pdf，或者做任何您想做的事情。

Question 3

我认为真正的问题不在 CUPS-PDF 中，甚至也不在 Ghostscript 中……我花了几天时间试图弄清真正的问题：为什么文本不可搜索？我想我找到了答案……

看看：这是我的工作流程：

我在云中有一台服务器，cups 和 cups-pdf 运行正常。
我在计算机中安装了 IPP 打印机 (xxx.com:631/printer/My-Cups-Pdf-Printer)
现在，我可以将任何应用程序中的任何文档打印到 My-Cups-Pdf-Printer，该文件将转换为 PDF 并存储在我的云服务器中。
很好......这将完美地工作。

现在奇怪的部分：

假设我想将 PDF 文件 (my-file.pdf)“重新打印”到 My-Cups-Pdf-Printer，只是为了利用后处理并在打印后执行一些任务...
我可以从 chrome、safari、preview.app 和 AdobeReader.app 打印 my-file.pdf
如果使用 Preview.app、chrome、safari 打印同一个文件 (my-file.pdf)，将在 cups-pdf 中生成不可搜索（NOT_SEARCHABLE）的 PDF 文件。
但如果使用 AdobeReader.app 打印同一个文件 (my-file.pdf)，将在 cups-pdf 中生成可搜索（SEARCHABLE）的 PDF 文件。

问题是 AdobeReader.app（不是 chrome 扩展）会正确包含所有字体编码信息。而 safari、chrome、preview.app 等应用程序所使用的操作系统默认打印系统不会包含这些信息。结果将是一个缺少字体编码、位置等正确信息的 PDF……

我曾以为问题可能出在 Ghostscript (gs GSCall) 中。但并不是……问题不在 CUPS-PDF 中，甚至也不在 Ghostscript 中。

问题在于操作系统打印机系统生成 PS 文件然后发送到 IPP 打印机 (My-Cups-Pdf-Printer) 的方式

有人认为这个问题可能是 Ghostscript 中的一个错误，但他也错了：https://bugs.ghostscript.com/show_bug.cgi?id=692450

我尝试使用 Ghostscript 从命令行手动将 PS 文件转换为 PDF，并得到相同的结果。来自 AdobeReader 的 PS 文件将生成可搜索的文本。来自 chrome、safari、preview.app 的 PS 文件将生成不可搜索（NOT_SEARCHABLE）的文本。

MacOS 和 Windows 中也会出现同样的情况。在这两种系统中，用 AdobeReader 打印 PDF 文件，最终都会在 Ghostscript 和 cups-pdf 中生成可搜索的 PDF。

其他人通过在 CUPS-PDF 中让 PDF 文件直接绕过处理来“修复”此问题的方法，可以在另一种工作流程场景中起作用（ https://github.com/alexivkin/CUPS-PDF-to-PDF?tab=readme-ov-file ），因为您只是在发送到 Ghostscript 之前让 PDF 文件在 cups-pdf 中绕过了处理。

所以事实上您处理的并不是 PS（来自打印机的 PostScript 文件）。在这种情况下，您处理的是一个本身已经是普通 PDF 的文件。

Answer