如何在 Ubuntu 上使用 Tesseract OCR Engine

如何在 Ubuntu 上使用 Tesseract OCR Engine

我已经安装了tesseract-ocr。我看了手册,但我没有看到可以定义图像边界(X、Y、W、H)的选项

有人可以帮忙吗,或者我问错了地方?

答案1

从命令行,您可以运行以下命令:

tesseract $imageFile $hocr hocr(需要 tesseract 3 及以上版本)。

其中 $imageFile 是输入图像,$hocr 是输出的 hOCR(HTML OCR)文件。

您可以解析文件(例如,使用 PHP)并为每个单词获取一个框。这将帮助您获取每个单词的坐标。

我们使用此方法制作自定义的可搜索 PDF:先把识别出的文本放在页面上,再用原始图像覆盖在文本之上。此过程也可以借助 hocr2pdf 来简化(参见 man hocr2pdf)。


然后我们使用经典的轻量级 R&OS PDF 类,配合自定义脚本动态构建 PDF,以便旋转图像。我附上了更新后的函数(其中 stream 函数也做了更新,通过 Accept-Ranges 头支持断点续传):

/**
 * Cezpdf extension that adds a PNG loader accepting a rotation angle and a
 * stream() method that sends the generated document to the client.
 */
class Mpdf extends Cezpdf
{

    /**
     * Read a PNG file, register it as an image object and draw it on the
     * current content stream, optionally rotated.
     *
     * Only non-interlaced PNGs with a bit depth of 8 or less and colour type
     * 0 (grayscale), 2 (truecolor) or 3 (palette) are accepted. On any
     * problem a message is recorded through addMessage() and nothing is drawn.
     *
     * @param string    $file  path of the PNG file to read
     * @param float     $x     horizontal translation written to the placement matrix
     * @param float     $y     vertical translation written to the placement matrix
     * @param float     $w     target width; when 0 it is derived from $h and the image aspect ratio
     * @param float     $h     target height; when 0 it is derived from $w and the image aspect ratio
     * @param float|int $angle rotation in degrees; the rotation matrix is concatenated
     *                         after the placement matrix
     * @return void
     */
    function addPngFromFile($file, $x, $y, $w = 0, $h = 0, $angle = 0)
    {
        $error = 0;
        $errormsg = '';

        // magic_quotes_runtime would escape bytes of the binary data while reading.
        // set_magic_quotes_runtime() no longer exists on PHP 7+, so only touch the
        // setting when the functions are available and the feature is switched on.
        $hadMagicQuotes = function_exists('set_magic_quotes_runtime')
            && function_exists('get_magic_quotes_runtime')
            && get_magic_quotes_runtime();
        if ($hadMagicQuotes) {
            @set_magic_quotes_runtime(0);
        }
        // warning suppressed on purpose: failure is reported through addMessage() below
        $data = @file_get_contents($file);
        if ($hadMagicQuotes) {
            @set_magic_quotes_runtime(1);
        }
        if ($data === false) {
            $error = 1;
            $errormsg = 'trouble opening file: '.$file;
        }

        // every PNG starts with this fixed 8 byte signature
        if (!$error && substr($data, 0, 8) != "\x89PNG\r\n\x1a\n") {
            $error = 1;
            $errormsg = 'this file does not have a valid header';
        }

        if (!$error) {
            // walk the chunk list: 4 byte length, 4 byte type, payload, 4 byte CRC
            $p = 8;
            $len = strlen($data);
            $haveHeader = 0;
            $info = array();
            $idata = '';
            $pdata = '';
            // require at least a complete length + type field before reading a chunk
            while ($p + 8 <= $len) {
                $chunkLen = $this->PRVT_getBytes($data, $p, 4);
                $chunkType = substr($data, $p + 4, 4);

                switch ($chunkType) {
                    case 'IHDR':
                        // this is where all the file information comes from
                        $info['width'] = $this->PRVT_getBytes($data, $p + 8, 4);
                        $info['height'] = $this->PRVT_getBytes($data, $p + 12, 4);
                        $info['bitDepth'] = ord($data[$p + 16]);
                        $info['colorType'] = ord($data[$p + 17]);
                        $info['compressionMethod'] = ord($data[$p + 18]);
                        $info['filterMethod'] = ord($data[$p + 19]);
                        $info['interlaceMethod'] = ord($data[$p + 20]);
                        $haveHeader = 1;
                        if ($info['compressionMethod'] != 0) {
                            $error = 1;
                            $errormsg = 'unsupported compression method';
                        }
                        if ($info['filterMethod'] != 0) {
                            $error = 1;
                            $errormsg = 'unsupported filter method';
                        }
                        break;

                    case 'PLTE':
                        $pdata .= substr($data, $p + 8, $chunkLen);
                        break;

                    case 'IDAT':
                        // image data may be split over several IDAT chunks
                        $idata .= substr($data, $p + 8, $chunkLen);
                        break;

                    case 'tRNS':
                        // this chunk can only occur once and it must occur after
                        // the PLTE chunk and before the IDAT chunk
                        $transparency = array();
                        if ($info['colorType'] == 3) {
                            // indexed colour: one alpha byte per palette entry.
                            // Scan downwards so that the lowest fully transparent
                            // index wins; stays 0 when no entry has alpha 0.
                            // The payload is $chunkLen bytes long, so the last
                            // valid offset is $chunkLen - 1 (the byte after it
                            // belongs to the CRC).
                            $transparency['type'] = 'indexed';
                            $trans = 0;
                            for ($i = $chunkLen - 1; $i >= 0; $i--) {
                                if (ord($data[$p + 8 + $i]) == 0) {
                                    $trans = $i;
                                }
                            }
                            $transparency['data'] = $trans;
                        } elseif ($info['colorType'] == 0) {
                            // grayscale: a single 2 byte gray value, only the
                            // low byte is used here
                            $transparency['type'] = 'indexed';
                            $transparency['data'] = ord($data[$p + 8 + 1]);
                        } elseif ($info['colorType'] == 2) {
                            // truecolor: 2 bytes each for red, green and blue
                            $transparency['r'] = $this->PRVT_getBytes($data, $p + 8, 2);
                            $transparency['g'] = $this->PRVT_getBytes($data, $p + 10, 2);
                            $transparency['b'] = $this->PRVT_getBytes($data, $p + 12, 2);
                        }
                        // any other colour type: unsupported transparency, the
                        // array is left empty
                        break;

                    default:
                        break;
                }

                // skip length field, type field, payload and CRC
                $p += $chunkLen + 12;
            }

            if (!$haveHeader) {
                $error = 1;
                $errormsg = 'information header is missing';
            }
            if (isset($info['interlaceMethod']) && $info['interlaceMethod']) {
                $error = 1;
                $errormsg = 'There appears to be no support for interlaced images in pdf.';
            }
            // a zero dimension would cause a division by zero when scaling below
            if (!$error && ($info['width'] <= 0 || $info['height'] <= 0)) {
                $error = 1;
                $errormsg = 'image has zero width or height';
            }
        }

        if (!$error && $info['bitDepth'] > 8) {
            $error = 1;
            $errormsg = 'only bit depth of 8 or less is supported';
        }

        if (!$error) {
            switch ($info['colorType']) {
                case 3:
                    $color = 'DeviceRGB';
                    $ncolor = 1;
                    break;
                case 2:
                    $color = 'DeviceRGB';
                    $ncolor = 3;
                    break;
                case 0:
                    $color = 'DeviceGray';
                    $ncolor = 1;
                    break;
                default:
                    $error = 1;
                    $errormsg = 'transparancey alpha channel not supported, transparency only supported for palette images.';
                    break;
            }
        }

        if ($error) {
            $this->addMessage('PNG error - ('.$file.') '.$errormsg);
            return;
        }

        // derive a missing dimension from the other one, keeping the aspect ratio
        if ($w == 0) {
            $w = $h / $info['height'] * $info['width'];
        }
        if ($h == 0) {
            $h = $w * $info['height'] / $info['width'];
        }

        // so this image is ok... add it in.
        $this->numImages++;
        $label = 'I'.$this->numImages;
        $this->numObj++;
        $options = array(
            'label' => $label,
            'data' => $idata,
            'bitsPerComponent' => $info['bitDepth'],
            'pdata' => $pdata,
            'iw' => $info['width'],
            'ih' => $info['height'],
            'type' => 'png',
            'color' => $color,
            'ncolor' => $ncolor,
        );
        if (isset($transparency)) {
            $options['transparency'] = $transparency;
        }
        $this->o_image($this->numObj, 'new', $options);

        // $angle is in degrees
        $angle = (float)$angle;
        $rad = deg2rad($angle);
        $cos = cos($rad);
        $sin = sin($rad);

        // %F (unlike %f) ignores the current locale, so the numbers written to
        // the content stream always use a decimal point
        $content = "\nq";
        $content .= "\n".sprintf('%.3F 0 0 %.3F %.3F %.3F cm', $w, $h, $x, $y);
        // the line below is a comment inside the content stream
        $content .= "\n%%angle ".$angle." ";
        $content .= "\n".sprintf('%.6F %.6F %.6F %.6F 0 0 cm', $cos, $sin, -1 * $sin, $cos);
        $content .= "\n/".$label.' Do';
        $content .= "\nQ";
        $this->objects[$this->currentContents]['c'] .= $content;
    }

    /**
     * Send the generated PDF to the client together with its HTTP headers.
     *
     * Recognised keys of $options:
     *  - 'Content-Disposition' => file name to suggest (default 'file.pdf')
     *  - 'Accept-Ranges'       => 1 to add an Accept-Ranges header, off by default
     *  - 'compress'            => 0 to call output(1) instead of output(); the
     *                             original notes say compression is on by default
     *
     * @param array|string $options header options; anything that is not an array is ignored
     * @return void
     */
    function stream($options = '')
    {
        if (!is_array($options)) {
            $options = array();
        }

        if (isset($options['compress']) && $options['compress'] == 0) {
            $raw = $this->output(1);
        } else {
            $raw = $this->output();
        }
        // trim once so that Content-Length always matches the bytes actually sent
        $body = ltrim($raw);
        $size = strlen($body);

        $fileName = isset($options['Content-Disposition']) ? $options['Content-Disposition'] : 'file.pdf';
        // the name is sent as a quoted-string so that names containing spaces
        // survive; drop characters that would terminate or break the quoting
        $fileName = str_replace(array('"', '\\', "\r", "\n"), '', $fileName);

        header('Content-type: application/pdf');
        header('Content-Length: '.$size);
        header('Content-Disposition: inline; filename="'.$fileName.'"');
        if (isset($options['Accept-Ranges']) && $options['Accept-Ranges'] == 1) {
            // NOTE(review): the value sent is the body length, whereas HTTP
            // defines the value of this header as a range unit such as "bytes".
            // Left unchanged because this method does not produce partial
            // responses - confirm what the consumers expect before changing it.
            header('Accept-Ranges: '.$size);
        }
        echo $body;
    }

}

相关内容