tesseract $imageFile $hocr hocr
使用 tesseract 3+ 版本。
hocr = HTML OCR 文件(即上面命令的输出格式)。
您可以解析该文件(例如,使用 PHP),为每个单词获取一个边界框(bounding box),从而得到每个单词的坐标。
我们使用此方法制作自定义的可搜索 PDF……我们先将文本放在页面上,然后在其上覆盖原始图像。此过程也可以借助 hocr2pdf 简化,参见: man hocr2pdf
然后我们使用经典的轻量级 R&OS PDF 类,通过自定义脚本动态构建 PDF 并旋转图像。我附上了更新后的函数(以及更新后的 “stream” 函数,以支持“断点续传”……Accept-Ranges)。
// Mpdf: subclass of Cezpdf. The methods shown in this file are
// addPngFromFile() (PNG placement with a rotation angle) and stream().
class Mpdf extends Cezpdf
// Adds a PNG image read from a file to the current content stream, optionally rotated.
//
// Visible steps: open the file, check the 8-byte PNG signature, walk the chunks
// (IHDR, PLTE, IDAT, tRNS), reject unsupported variants, then append the PDF
// operators that scale/translate, rotate and paint the image.
//
// NOTE(review): this excerpt is missing lines (closing braces, loop/switch headers,
// 'break' statements, several initialisations). The comments below describe only
// what is actually shown; anything else is marked as an assumption.
//
// $file    path of the PNG file; opened with fopen() in binary read mode
// $x, $y   translation components of the placement matrix
// $w, $h   scale components of the placement matrix; both are tested against 0
//          below, presumably to derive a size from the image -- confirm
// $angle   rotation, in degrees
function addPngFromFile($file,$x,$y,$w=0,$h=0,$angle=0)
// read in a png file, interpret it, then add to the system
// NOTE(review): get_magic_quotes_runtime() was deprecated in PHP 7.4 and removed in
// PHP 8.0; how $tmp is used afterwards is not visible here.
$tmp = get_magic_quotes_runtime();
// '@' suppresses the warning; failure is detected through the falsy $fp instead.
$fp = @fopen($file,'rb');
if ($fp){
// NOTE(review): $data is appended to but never initialised in the visible lines, and
// a single 1024-byte read is shown -- presumably this sits inside an elided read loop.
$data .= fread($fp,1024);
} else {
$error = 1;
$errormsg = 'trouble opening file: '.$file;
// NOTE(review): no initialisation of $error (e.g. to 0) is visible before this test.
if (!$error){
// The fixed PNG signature: 137 'P' 'N' 'G' CR LF 26 LF.
$header = chr(137).chr(80).chr(78).chr(71).chr(13).chr(10).chr(26).chr(10);
if (substr($data,0,8)!=$header){
$errormsg = 'this file does not have a valid header';
if (!$error){
// set pointer
// (8 = first byte after the signature checked above)
$p = 8;
$len = strlen($data);
// cycle through the file, identifying chunks
while ($p<$len){
// Each chunk starts with a 4-byte length at $p followed by a 4-byte type at $p+4,
// so the chunk's data begins at $p+8. PRVT_getBytes() is defined elsewhere;
// presumably it decodes an integer from the given number of bytes -- confirm.
$chunkLen = $this->PRVT_getBytes($data,$p,4);
$chunkType = substr($data,$p+4,4);
case 'IHDR':
// this is where all the file information comes from
// NOTE(review): the code that fills $info is not visible in this excerpt.
if ($info['compressionMethod']!=0){
$errormsg = 'unsupported compression method';
if ($info['filterMethod']!=0){
$errormsg = 'unsupported filter method';
case 'PLTE':
case 'IDAT':
case 'tRNS':
//this chunk can only occur once and it must occur after the PLTE chunk and before IDAT chunk
//print "tRNS found, color type = ".$info['colorType']."<BR>";
$transparency = array();
if ($info['colorType'] == 3) { // indexed color, rbg
/* corresponding to entries in the plte chunk
Alpha for palette index 0: 1 byte
Alpha for palette index 1: 1 byte
// there will be one entry for each palette entry. up until the last non-opaque entry.
// set up an array, stretching over all palette entries which will be 0 (opaque) or 1 (transparent)
// Three bytes per palette entry; $pdata is presumably filled in the PLTE case -- confirm.
$numPalette = strlen($pdata)/3;
// Scans the alpha bytes from the end of the chunk towards the start.
// NOTE(review): the loop starts at $i == $chunkLen, so the first iteration reads
// $data[$p+8+$chunkLen], one byte past the chunk's last data byte
// ($p+8+$chunkLen-1). Looks like an off-by-one -- confirm.
for ($i=$chunkLen;$i>=0;$i--){
if (ord($data[$p+8+$i])==0){
// NOTE(review): the code that sets $trans is not visible here.
$transparency['data'] = $trans;
} elseif($info['colorType'] == 0) { // grayscale
/* corresponding to entries in the plte chunk
Gray: 2 bytes, range 0 .. (2^bitdepth)-1
// Only the second byte of the 2-byte gray sample is read.
$transparency['data'] = ord($data[$p+8+1]);
} elseif($info['colorType'] == 2) { // truecolor
/* corresponding to entries in the plte chunk
Red: 2 bytes, range 0 .. (2^bitdepth)-1
Green: 2 bytes, range 0 .. (2^bitdepth)-1
Blue: 2 bytes, range 0 .. (2^bitdepth)-1
$transparency['r']=$this->PRVT_getBytes($data,$p+8,2); // r from truecolor
$transparency['g']=$this->PRVT_getBytes($data,$p+10,2); // g from truecolor
$transparency['b']=$this->PRVT_getBytes($data,$p+12,2); // b from truecolor
} else {
//unsupported transparency type
// KS End new code
// Advance to the next chunk: data length plus 12 bytes of framing
// (4 length + 4 type + 4 trailing bytes, the CRC in the PNG format).
$p += $chunkLen+12;
$error = 1;
$errormsg = 'information header is missing';
if (isset($info['interlaceMethod']) && $info['interlaceMethod']){
$error = 1;
$errormsg = 'There appears to be no support for interlaced images in pdf.';
if (!$error && $info['bitDepth'] > 8){
$error = 1;
$errormsg = 'only bit depth of 8 or less is supported';
if (!$error){
// Only colour types 0 (grayscale), 2 (truecolor) and 3 (indexed) are accepted.
if ($info['colorType']!=2 && $info['colorType']!=0 && $info['colorType']!=3){
$error = 1;
$errormsg = 'transparancey alpha channel not supported, transparency only supported for palette images.';
} else {
// Maps the PNG colour type to a PDF colour space name.
// NOTE(review): no 'break' statements are visible between the cases; if they are
// really absent every case falls through to 'DeviceGray' -- confirm against the full file.
switch ($info['colorType']){
case 3:
$color = 'DeviceRGB';
case 2:
$color = 'DeviceRGB';
case 0:
$color = 'DeviceGray';
if ($error){
// Errors are reported through addMessage() rather than thrown.
$this->addMessage('PNG error - ('.$file.') '.$errormsg);
if ($w==0){
if ($h==0){
// so this image is ok... add it in.
// NOTE(review): $label and $idata are not assigned in the visible lines, and this
// array literal is cut off before its closing parenthesis.
$options = array('label'=>$label,'data'=>$idata,'bitsPerComponent'=>$info['bitDepth'],'pdata'=>$pdata
if (isset($transparency)){
# $angle in degrees
// First matrix: [w 0 0 h x y] -- scale by ($w,$h) and translate to ($x,$y).
$this->objects[$this->currentContents]['c'].="\n".sprintf('%.3f',$w)." 0 0 ".sprintf('%.3f',$h)." ".sprintf('%.3f',$x)." ".sprintf('%.3f',$y)." cm";
// Writes the angle into the stream on a '%'-prefixed line (a comment in PDF syntax).
$this->objects[$this->currentContents]['c'].="\n"."%%angle $angle ";
$a = deg2rad((float)$angle);
// Second matrix: [cos sin -sin cos 0 0] -- a rotation by $a radians with no translation.
$this->objects[$this->currentContents]['c'].="\n".sprintf('%.6f',cos($a))." ".sprintf('%.6f',sin($a))." ".sprintf('%.6f',-1*sin($a))." ".sprintf('%.6f',cos($a))." 0 0 cm";
// Paints the object named $label with the 'Do' operator.
$this->objects[$this->currentContents]['c'].="\n/".$label.' Do';
// Sends the generated PDF to the client: emits the HTTP headers, then echoes the document.
//
// $options  settings for the headers and output, described in the comment below;
//           a non-array value is detected by the is_array() check.
//
// NOTE(review): this excerpt is missing lines (closing braces and the body of the
// is_array() branch); the comments describe only what is shown.
function stream($options=''){
// setting the options allows the adjustment of the headers
// values at the moment are:
// 'Content-Disposition'=>'filename' - sets the filename, though not too sure how well this will
// work as in my trial the browser seems to use the filename of the php file with .pdf on the end
// 'Accept-Ranges'=>1 or 0 - if this is not set to 1, then this header is not included, off by default
// this header seems to have caused some problems despite the fact that it is supposed to solve
// them, so I am leaving it off by default.
// 'compress'=> 1 or 0 - apply content stream compression, this is on (1) by default
// NOTE(review): the body of this branch is not visible; presumably it replaces a
// non-array $options with an empty array -- confirm.
if (!is_array($options)){
// 'compress' == 0 calls output(1); every other case calls output() with no argument.
if ( isset($options['compress']) && $options['compress']==0){
$tmp = $this->output(1);
} else {
$tmp = $this->output();
header("Content-type: application/pdf");
// The length is measured after ltrim() so it matches the bytes echoed at the end.
header("Content-Length: ".strlen(ltrim($tmp)));
$fileName = (isset($options['Content-Disposition'])?$options['Content-Disposition']:'file.pdf');
// NOTE(review): the filename is inserted unquoted and unescaped; a caller-supplied
// name containing spaces or special characters may yield a malformed header.
header("Content-Disposition: inline; filename=".$fileName);
if (isset($options['Accept-Ranges']) && $options['Accept-Ranges']==1){
// NOTE(review): the header value sent here is the document's byte length. HTTP
// defines Accept-Ranges as a range unit (e.g. "bytes" or "none") -- confirm intent.
header("Accept-Ranges: ".strlen(ltrim($tmp)));
// Leading whitespace is stripped from the output before it is sent.
echo ltrim($tmp);