All Downloads are FREE. Search and download functionalities are using the official Maven repository.

srcnativelibs.Vision.tessocr.cpp Maven / Gradle / Ivy

/*
 * Copyright 2010-2014, Sikuli.org, sikulix.com
 * Released under the MIT License.
 *
 */
#include 
#include 
#include 
#include 
#include 
#include 
#include "tessocr.h"
#include "sikuli-debug.h"

using namespace std;
using namespace sikuli;
using namespace tesseract;


// Definition of the static Tesseract engine object used by the OCR::
// routines in this file (getBoxText, getText, recognize_to_words, ...).
TessBaseAPI OCR::_tessAPI;

// Number of bytes occupied by one image row of `xsize` pixels at `bpp` bits
// per pixel. For bpp > 8 the row's bit count is rounded up to whole bytes;
// otherwise the pixel count is divided, rounding up, by the number of pixels
// that fit in one byte (8/bpp). No extra row padding is added.
#define COMPUTE_IMAGE_XDIM(xsize,bpp) ((bpp)>8 ? ((xsize)*(bpp)+7)/8 :((xsize)+8/(bpp)-1)/(8/(bpp)))
/**
 * Runs recognition on a raw image buffer and returns the result of
 * TessBaseAPI::GetBoxText for page 0.
 *
 * @param imagedata  raw pixel buffer passed to TessBaseAPI::SetImage
 * @param width      image width in pixels
 * @param height     image height in pixels
 * @param bpp        bits per pixel of the buffer
 * @return the buffer produced by GetBoxText(0); the caller in this file
 *         releases it with delete[].
 */
char* OCR::getBoxText(const unsigned char* imagedata,
                         int width, int height, int bpp){

   const int row_bytes = COMPUTE_IMAGE_XDIM(width,bpp);
   const int pixel_bytes = bpp / 8;

   _tessAPI.SetImage(imagedata, width, height, pixel_bytes, row_bytes);
   _tessAPI.Recognize(0);

   return _tessAPI.GetBoxText(0);
}

/**
 * Runs recognition on a raw image buffer and returns the recognized text
 * as produced by TessBaseAPI::GetUTF8Text.
 *
 * @param imagedata  raw pixel buffer passed to TessBaseAPI::SetImage
 * @param width      image width in pixels
 * @param height     image height in pixels
 * @param bpp        bits per pixel of the buffer
 * @return the buffer produced by GetUTF8Text(); the caller in this file
 *         releases it with delete[].
 */
char* OCR::getText(const unsigned char* imagedata,
                         int width, int height, int bpp){

   const int row_bytes = COMPUTE_IMAGE_XDIM(width,bpp);
   const int pixel_bytes = bpp / 8;

   _tessAPI.SetImage(imagedata, width, height, pixel_bytes, row_bytes);
   _tessAPI.Recognize(0);

   return _tessAPI.GetUTF8Text();
}


/** Builds a rectangle from its left/top corner and its size. */
OCRRect::OCRRect(int x_, int y_, int width_, int height_)
   : x(x_),
     y(y_),
     width(width_),
     height(height_)
{
}

/**
 * Builds an "unset" rectangle: every field is -1. addOCRRect() treats a
 * negative width and height as "no content yet".
 */
OCRRect::OCRRect()
   : x(-1),
     y(-1),
     width(-1),
     height(-1)
{
}

/**
 * Grows this rectangle so that it also covers `rect`.
 *
 * If this rectangle is still unset (negative width and height) it simply
 * becomes a copy of `rect`; otherwise it becomes the bounding box of both.
 */
void
OCRRect::addOCRRect(const OCRRect& rect){
   const bool is_unset = (width < 0) && (height < 0);
   if (is_unset){
      x = rect.x;
      y = rect.y;
      width = rect.width;
      height = rect.height;
      return;
   }

   // Smallest left/top edge of the two rectangles.
   const int new_left = (rect.x < x) ? rect.x : x;
   const int new_top = (rect.y < y) ? rect.y : y;

   // Largest right/bottom edge of the two rectangles.
   const int own_right = x + width;
   const int other_right = rect.x + rect.width;
   const int new_right = (other_right < own_right) ? own_right : other_right;

   const int own_bottom = y + height;
   const int other_bottom = rect.y + rect.height;
   const int new_bottom = (other_bottom < own_bottom) ? own_bottom : other_bottom;

   x = new_left;
   y = new_top;
   width = new_right - new_left;
   height = new_bottom - new_top;
}


void
OCRWord::add(const OCRChar& ocr_char){
   addOCRRect(ocr_char);
   ocr_chars_.push_back(ocr_char);
}

/**
 * Concatenates the `ch` string of every character in the word, in order.
 *
 * @return the word's text; empty when the word holds no characters.
 */
string
OCRWord::str(){
   string ret;
   // Template argument restored: ocr_chars_ stores OCRChar (see add()).
   for (vector<OCRChar>::iterator it = ocr_chars_.begin();
        it != ocr_chars_.end(); ++it){
      ret += it->ch;
   }
   return ret;
}

/** Returns a copy of the characters that make up this word. */
vector<OCRChar>
OCRWord::getChars(){
   return ocr_chars_;
}

/** Returns the word's text; identical to str(). */
string
OCRWord::getString(){
   const string text = str();
   return text;
}

/**
 * Empties the word: drops all characters and marks the bounding box as
 * unset (negative width and height) so the next add() starts a fresh box.
 */
void
OCRWord::clear() {
   ocr_chars_.clear();
   height = -1;
   width = -1;
}

bool
OCRWord::isValidWord(){
   return OCR::_tessAPI.IsValidWord(str().c_str());
}

/** Appends a word to the line and extends the line's bounding box. */
void
OCRLine::addWord(OCRWord& ocr_word){
   // Grow the bounding box first, then store a copy of the word.
   this->addOCRRect(ocr_word);
   this->ocr_words_.push_back(ocr_word);
}

/** Returns a copy of the words that make up this line. */
vector<OCRWord>
OCRLine::getWords(){
   return ocr_words_;
}

/**
 * Joins the text of all words in the line with single spaces.
 *
 * @return the line's text; empty when the line holds no words.
 */
string
OCRLine::getString(){
   string ret;
   // Template argument restored: ocr_words_ stores OCRWord (see addWord()).
   for (vector<OCRWord>::iterator it = ocr_words_.begin();
        it != ocr_words_.end(); ++it){
      if (it != ocr_words_.begin())
         ret += " ";
      ret += it->getString();
   }
   return ret;
}

/** Appends a line to the paragraph and extends the paragraph's bounding box. */
void
OCRParagraph::addLine(OCRLine& ocr_line){
   // Grow the bounding box first, then store a copy of the line.
   this->addOCRRect(ocr_line);
   this->ocr_lines_.push_back(ocr_line);
}

/** Returns a copy of the lines that make up this paragraph. */
vector<OCRLine>
OCRParagraph::getLines(){
   return ocr_lines_;
}

//void
//OCRText::add(OCRWord& ocr_word){
//   ocr_words_.push_back(ocr_word);
//}
//
//void
//OCRText::addLine(OCRLine& ocr_line){
//   ocr_lines_.push_back(ocr_line);
//}


/**
 * Placeholder: currently writes nothing.
 *
 * TODO: reimplement. The previous (disabled) version opened `filename`
 * with an ofstream and wrote every word's str() followed by a space.
 */
void
OCRText::save(const char* filename){
   (void)filename;  // intentionally unused until this is reimplemented
}

void
OCRText::save_with_location(const char* filename){


   vector words = getWords();

   ofstream of(filename);

   for (vector::iterator it = words.begin();
        it != words.end(); ++it){

      OCRWord& w = *it;

      of << w.x << " " << w.y << " " << w.width << " " << w.height << " ";
      of << w.getString() << " ";
      of << endl;
   }

   of.close();
}

/** Appends a paragraph to the text and extends the text's bounding box. */
void
OCRText::addParagraph(OCRParagraph& ocr_paragraph){
   // Grow the bounding box first, then store a copy of the paragraph.
   this->addOCRRect(ocr_paragraph);
   this->ocr_paragraphs_.push_back(ocr_paragraph);
}

/**
 * Collects the text of every line of every paragraph, in order.
 *
 * @return one string per line (see OCRLine::getString()).
 */
vector<string>
OCRText::getLineStrings(){
   vector<string> line_strings;

   for (vector<OCRParagraph>::iterator it = ocr_paragraphs_.begin();
        it != ocr_paragraphs_.end(); ++it){

      // getLines() returns the vector by value. Take ONE copy and iterate
      // over it: calling getLines().begin() and getLines().end() separately
      // would compare iterators into two different temporaries.
      vector<OCRLine> lines = it->getLines();

      for (vector<OCRLine>::iterator it1 = lines.begin();
           it1 != lines.end(); ++it1){
         line_strings.push_back(it1->getString());
      }
   }

   return line_strings;
}


/**
 * Flattens the paragraph / line / word hierarchy into a single list.
 *
 * @return copies of all words, in paragraph order then line order.
 */
vector<OCRWord>
OCRText::getWords(){
   vector<OCRWord> ret_words;

   for (vector<OCRParagraph>::iterator it = ocr_paragraphs_.begin();
        it != ocr_paragraphs_.end(); ++it){

      vector<OCRLine> lines = it->getLines();

      for (vector<OCRLine>::iterator it1 = lines.begin();
           it1 != lines.end(); ++it1){

         vector<OCRWord> words = it1->getWords();
         ret_words.insert(ret_words.end(), words.begin(), words.end());
      }
   }

   return ret_words;
}

/** Returns a copy of the paragraphs that make up this text. */
vector<OCRParagraph>
OCRText::getParagraphs(){
   return ocr_paragraphs_;
}

/**
 * Returns the text of every word, with a "\n" entry appended after the
 * words of each line.
 *
 * @return word strings in reading order, interleaved with "\n" markers.
 */
vector<string>
OCRText::getWordStrings(){
   vector<string> word_strings;

   for (vector<OCRParagraph>::iterator it = ocr_paragraphs_.begin();
        it != ocr_paragraphs_.end(); ++it){

      vector<OCRLine> lines = it->getLines();

      for (vector<OCRLine>::iterator it1 = lines.begin();
           it1 != lines.end(); ++it1){

         vector<OCRWord> words = it1->getWords();

         for (vector<OCRWord>::iterator it2 = words.begin();
              it2 != words.end(); ++it2){
            word_strings.push_back(it2->getString());
         }

         // mark the end of the line
         word_strings.push_back("\n");
      }
   }

   return word_strings;
}

/**
 * Joins the entries of getWordStrings() into one string, each entry
 * followed by a single space.
 *
 * The earlier version appended the first entry without its trailing space,
 * which glued the first two words together ("ab c " instead of "a b c ").
 * Every entry, including the first, now gets the same separator.
 *
 * @return the whole text; empty when there are no entries.
 */
string
OCRText::getString(){
   vector<string> word_strings = getWordStrings();

   string ret;
   for (vector<string>::iterator it = word_strings.begin();
        it != word_strings.end(); ++it){
      ret += *it;
      ret += " ";
   }

   return ret;
}


/**
 * Maps a character to a small numeric code.
 *
 *   '0'..'9'  ->  2..11
 *   'a'..'z'  -> 12..37
 *   'A'..'Z'  -> 12..37  (same codes as the lowercase letters)
 *   other     ->  0
 */
char
encode(char ch){
   if ('0' <= ch && ch <= '9')
      return ch - '0' + 2;

   if ('a' <= ch && ch <= 'z')
      return ch - 'a' + 12;

   if ('A' <= ch && ch <= 'Z')
      return ch - 'A' + 12;

   return 0;
}


// produce a new image 200% the size of the given image
unsigned char* x2(const unsigned char* imagedata,
                  int width, int height, int bpp){

   int bytes_per_pixel = bpp / 8;

   unsigned char* newimage = new unsigned char[width*height*4];

   const unsigned char* p = imagedata;
   unsigned char* q = newimage;

   // NOTE(review): the next line is damaged. The `for` statement is cut off
   // after "k" and continues with what looks like the tail of a putenv()
   // call; the `#else` below has no matching `#if` in view. The remainder of
   // x2() and the beginning of the following function (it uses `datapath`,
   // `_lang` and `isInitialized`, presumably OCR::init) are missing and must
   // be restored from the original source before this file can compile.
   for (int k=0;k(env_datapath.c_str()));
#else
   //putenv on Mac breaks the "open" command somehow.
   //we have to use setenv instead.
   setenv("TESSDATA_PREFIX", datapath, 1);
#endif
   // NOTE(review): `ret` is not checked in the visible code; isInitialized is
   // set to true regardless of what Init() returned.
   int ret = _tessAPI.Init(datapath, _lang.c_str());
	 //TODO
	 //int ret = _tessAPI.Init(datapath, _lang.c_str(), OEM_TESSERACT_ONLY);
	 //   _tessAPI.SetAccuracyVSpeed(AVS_MOST_ACCURATE); // FIXME: doesn't work?
   isInitialized = true;
}





#include "cvgui.h"
using namespace cv;

#define MAXLEN 80

/** Returns the smallest of the three arguments. */
static int findMin(int d1, int d2, int d3) {
   int smallest = d1;
   if (d2 < smallest)
      smallest = d2;
   if (d3 < smallest)
      smallest = d3;
   return smallest;
}

/**
 * Recursive edit distance with an edit budget `k`.
 *
 * Every substitution, insertion or deletion spends one unit of `k`; matching
 * characters are free. Once the budget reaches zero the recursion stops and
 * contributes 0 for whatever is left of both strings. If either string runs
 * out first, the remaining length of the other one is returned, whatever the
 * budget. The result is therefore only an exact distance while the budget is
 * not exhausted.
 *
 * @param s1  first NUL-terminated string
 * @param s2  second NUL-terminated string
 * @param k   remaining edit budget
 */
static int
findEditDistanceLessThanK(const char *s1, const char *s2,
                                    int k){
   if (*s1 == 0)
      return static_cast<int>(strlen(s2));
   if (*s2 == 0)
      return static_cast<int>(strlen(s1));
   if (k == 0)
      return 0;

   // Keep / substitute the leading character.
   const bool heads_match = (*s1 == *s2);
   const int via_update = heads_match
      ? findEditDistanceLessThanK(s1+1, s2+1, k)
      : 1 + findEditDistanceLessThanK(s1+1, s2+1, k-1);

   // Insert (skip a character of s2) or delete (skip a character of s1).
   const int via_insert = 1 + findEditDistanceLessThanK(s1, s2+1, k-1);
   const int via_delete = 1 + findEditDistanceLessThanK(s1+1, s2, k-1);

   return findMin(via_update, via_insert, via_delete);
}

/**
 * Returns the edit (Levenshtein) distance between s1 and s2: the minimum
 * number of single-character substitutions, insertions and deletions that
 * turn one string into the other.
 *
 * Computed row by row with dynamic programming in O(len1 * len2) time and
 * O(len2) memory; the previous three-way recursion was exponential in the
 * string length.
 *
 * @param s1  first NUL-terminated string
 * @param s2  second NUL-terminated string
 */
static int findEditDistance(const char *s1, const char *s2) {
   const size_t len1 = strlen(s1);
   const size_t len2 = strlen(s2);

   // prev[j] = distance between the first i-1 chars of s1 and first j of s2.
   std::vector<int> prev(len2 + 1);
   std::vector<int> curr(len2 + 1);
   for (size_t j = 0; j <= len2; ++j)
      prev[j] = static_cast<int>(j);

   for (size_t i = 1; i <= len1; ++i){
      curr[0] = static_cast<int>(i);
      for (size_t j = 1; j <= len2; ++j){
         const int update = prev[j-1] + (s1[i-1] == s2[j-1] ? 0 : 1);
         const int insert = curr[j-1] + 1;
         const int remove = prev[j] + 1;

         int best = update;
         if (insert < best)
            best = insert;
         if (remove < best)
            best = remove;
         curr[j] = best;
      }
      prev.swap(curr);
   }

   return prev[len2];
}

/**
 * Sharpens `img` in place: blurs a copy with a Gaussian (sigma 5) and then
 * stores 2.5 * img - 1.5 * blurred back into `img`.
 */
void sharpen(Mat& img){
   const double sigma = 5;
   const double weight_original = 2.5;
   const double weight_blurred = -1.5;

   Mat blurred;
   GaussianBlur(img, blurred, cv::Size(0, 0), sigma);
   addWeighted(img, weight_original, blurred, weight_blurred, 0, img);
}

float preprocess_for_ocr(const Mat& in_img, Mat& out_img){
   const float MIN_HEIGHT = 30;
   float scale = 1.f;
   if (in_img.rows < MIN_HEIGHT){
      scale = MIN_HEIGHT / float(in_img.rows);
      resize(in_img, out_img, Size(in_img.cols*scale,in_img.rows*scale));
			//TODO
			//resize(in_img, out_img, Size(in_img.cols*scale,in_img.rows*scale), 0, 0, INTER_CUBIC);
			//copyMakeBorder (in_img, out_img, 0, (scale-1)*in_img.rows, 0, (scale-1)*in_img.cols, BORDER_REPLICATE);
   }else {
      out_img = in_img;
   }
   sharpen(out_img);
   //imshow("ocrImage", out_img);
   return scale;
}

/**
 * Recognizes the text in `blobImage` and returns it as a string.
 *
 * The image is converted with CV_RGB2GRAY, preprocessed for OCR and passed
 * to getText() as an 8 bits-per-pixel buffer.
 *
 * @return the recognized text, or "" when getText() returns null.
 */
string OCR::recognize_as_string(const Mat& blobImage){
   OCR::init();

   Mat gray;
   cvtColor(blobImage, gray, CV_RGB2GRAY);

   Mat ocrImage;  // the image passed to tesseract
   preprocess_for_ocr(gray, ocrImage);

   char* text = getText(ocrImage.data,
                        ocrImage.cols,
                        ocrImage.rows,
                        8);
   if (!text)
      return "";

   // Copy the text out before releasing the engine's buffer.
   string ret(text);
   delete [] text;
   return ret;
}

/**
 * Runs word-level OCR on the region `blob` of `screen`.
 *
 * The region is preprocessed (possibly enlarged), recognized, and the word
 * rectangles are mapped back: divided by the preprocessing scale when the
 * image was enlarged, then offset by the blob's position.
 *
 * NOTE(review): the buffer is passed on as 8 bits per pixel, so `screen`
 * is presumably expected to be a single-channel 8-bit image; confirm.
 *
 * @return the recognized words with their rectangles offset by blob.x/blob.y.
 */
vector<OCRWord> getWordsFromImage(const Mat& screen, const Blob& blob){

   Mat blobImage(screen,blob);

   Mat ocrImage;  // the image passed to tesseract
   float scale = preprocess_for_ocr(blobImage, ocrImage);

   // Template argument restored: recognize_to_words() yields OCRWord objects.
   vector<OCRWord> ocr_words;
   ocr_words = OCR::recognize_to_words((unsigned char*)ocrImage.data,
                              ocrImage.cols,
                              ocrImage.rows,
                              8);

   for (vector<OCRWord>::iterator iter = ocr_words.begin();
        iter != ocr_words.end(); ++iter){
      OCRWord& word = *iter;
      if(scale>1.f){
         // scale back the coordinates in the OCR result
         word.x = word.x/scale;
         word.y = word.y/scale;
         word.width = word.width/scale;
         word.height = word.height/scale;
      }

      word.x += blob.x;
      word.y += blob.y;
   }

   return ocr_words;
}



/**
 * Runs character-level OCR on the region `blob` of `screen`.
 *
 * The region is preprocessed (possibly enlarged), recognized, and the
 * character rectangles are mapped back: divided by the preprocessing scale
 * when the image was enlarged, then offset by the blob's position.
 *
 * NOTE(review): the buffer is passed on as 8 bits per pixel, so `screen`
 * is presumably expected to be a single-channel 8-bit image; confirm.
 *
 * @return the recognized characters with their rectangles offset by
 *         blob.x/blob.y.
 */
vector<OCRChar> run_ocr(const Mat& screen, const Blob& blob){

   Mat blobImage(screen,blob);

   Mat ocrImage;  // the image passed to tesseract
   float scale = preprocess_for_ocr(blobImage, ocrImage);

   // Template argument restored: this recognize() overload yields OCRChar.
   vector<OCRChar> ocr_chars;
   ocr_chars = OCR::recognize((unsigned char*)ocrImage.data,
                              ocrImage.cols,
                              ocrImage.rows,
                              8);

   for (vector<OCRChar>::iterator iter = ocr_chars.begin();
        iter != ocr_chars.end(); ++iter){
      OCRChar& ocrchar = *iter;
      if(scale>1.f){
         // scale back the coordinates in the OCR result
         ocrchar.x = ocrchar.x/scale;
         ocrchar.y = ocrchar.y/scale;
         ocrchar.width = ocrchar.width/scale;
         ocrchar.height = ocrchar.height/scale;
      }

      ocrchar.x += blob.x;
      ocrchar.y += blob.y;
   }

   return ocr_chars;
}


void
find_phrase_helper(const Mat& screen_gray, vector words, vector lineblobs,
                   LineBlob resultblob, vector& results, bool is_find_one = true){

   string word = words[0];

   vector rest;
   for (vector::iterator it2 = words.begin()+1;
        it2 != words.end(); ++ it2)
      rest.push_back(*it2);

   dhead("find_phrase") << "<" << word << ">" << endl;

   vector lineblobs_thisround = lineblobs;
   for (int r = 0; r < 3; ++r){


      for (int tolerance = 0; tolerance < 3; ++tolerance){

         vector lineblobs_nextround;

         for (vector::iterator it = lineblobs_thisround.begin();
              it != lineblobs_thisround.end(); ++it){

            LineBlob lineblob = *it;


            if (abs((int)lineblob.blobs.size() - (int)word.size()) > tolerance){
               lineblobs_nextround.push_back(lineblob);
               continue;
            }

            dhead("find_phrase") << lineblob.x << "," << lineblob.y << "," << lineblob.width << "," << lineblob.height << endl;

            vector ocr_chars = run_ocr(screen_gray, lineblob);
            dhead("find_phrase") << word << "<->";

            string ocrword = "";
            for (vector::iterator iter = ocr_chars.begin();
                 iter != ocr_chars.end(); iter++){

               OCRChar& ocrchar = *iter;
               dout("find_phrase") << ocrchar.ch;

               ocrword = ocrword + ocrchar.ch;
            }

            if (ocr_chars.size() < 1){
               dout("find_phrase") << endl;
               continue;
            }

            int d = findEditDistanceLessThanK(word.c_str(), ocrword.c_str(),3);


            dout("find_phrase") << '[' << d << ']';

            if (d > 2){
               dout("find_phrase") << endl;
               lineblobs_nextround.push_back(lineblob);
               continue;
            }


            if (rest.empty()){
               dout("find_phrase") << " ... match!" << endl;

               //Blob b = resultblob;
               //dout("find_phrase") << b.x << "," << b.y << endl;
               //b = lineblob;
               //dout("find_phrase") << b.x << "," << b.y << endl;

               resultblob.merge(lineblob);

               FindResult result(resultblob.x,resultblob.y,
                                 resultblob.width,resultblob.height, 1.0);

               results.push_back(result);
               return;

            }
            else
               dout("find_phrase") << endl;




            vector nextblobs;
            for (vector::iterator it2 = lineblobs.begin();
                 it2 != lineblobs.end(); ++it2){

               LineBlob& b1 = lineblob;
               LineBlob& b2 = *it2;

               bool similar_baseline = abs((b1.y + b1.height) - (b2.y + b2.height)) < 5;
               bool close_right = (b2.x > b1.x) && (b2.x - (b1.x+b1.width)) < 20;
               bool close_below = (b2.y > b1.y) && (b2.y - b1.y) < 20;

               if (close_right && similar_baseline)
                  nextblobs.push_back(b2);

            }



            if (!rest.empty() && !nextblobs.empty()){

               LineBlob next_resultblob = resultblob;
               next_resultblob.merge(lineblob);

               find_phrase_helper(screen_gray, rest, nextblobs, next_resultblob, results, is_find_one);
            }



            dout("find_phrase") << endl;

            // check if we have already found one match
            if (is_find_one && results.size() >= 1)
               // if so, we return the reuslts right away
               return;

         }


         lineblobs_thisround = lineblobs_nextround;

      }



   }
}


/**
 * Public entry point for the budgeted edit distance; forwards unchanged to
 * findEditDistanceLessThanK(s1, s2, k).
 */
int
OCR::findEditDistance(const char *s1, const char *s2,
                      int k){
   const int distance = findEditDistanceLessThanK(s1, s2, k);
   return distance;
}

/**
 * Searches `screen` for a phrase given as a list of words.
 *
 * Word-sized line blobs are extracted from `screen`, the image is converted
 * with CV_RGB2GRAY, and find_phrase_helper() performs the matching.
 *
 * @param screen       image to search
 * @param words        words of the phrase, in order
 * @param is_find_one  stop after the first match
 * @return the matches found (possibly empty).
 */
vector<FindResult>
OCR::find_phrase(const Mat& screen, vector<string> words, bool is_find_one){

   // Template arguments restored from how the values are used by
   // find_phrase_helper().
   vector<LineBlob> lineblobs;
   cvgui::getLineBlobsAsIndividualWords(screen, lineblobs);

   Mat screen_gray;
   cvtColor(screen,screen_gray,CV_RGB2GRAY);

   vector<FindResult> results;

   LineBlob empty;
   find_phrase_helper(screen_gray, words, lineblobs, empty, results, is_find_one);

   return results;
}

/**
 * Searches `screenshot` for a single word by delegating to find_phrase()
 * with a one-element phrase.
 *
 * @param screenshot   image to search
 * @param word         word to look for
 * @param is_find_one  stop after the first match
 * @return the matches found (possibly empty).
 */
vector<FindResult>
OCR::find_word(const Mat& screenshot, string word, bool is_find_one){

   vector<string> words(1, word);

   return find_phrase(screenshot, words, is_find_one);
}

/**
 * Loads an image file as a color image and runs full-page recognition.
 *
 * @param screenshot_filename  path of the image file
 * @return the recognized text; an empty OCRText when the file could not be
 *         read (imread() returns an empty matrix in that case, which must
 *         not be passed into the recognition pipeline).
 */
OCRText
OCR::recognize_screenshot(const char* screenshot_filename){

   Mat screenshot = imread(screenshot_filename, 1);
   if (screenshot.empty())
      return OCRText();

   return recognize(screenshot);
}



/**
 * Groups a sequence of characters into the words of one line.
 *
 * For every character the horizontal gap to its predecessor is compared
 * with the previous gap and with the gap to its successor; when it exceeds
 * either by more than 2, the word collected so far is closed and a new one
 * is started with the current character.
 *
 * @param ocrchars  characters of one line, in order
 * @return the line built from the grouped words.
 */
OCRLine
linkOCRCharsToOCRLine(const vector<OCRChar>& ocrchars){

   OCRLine ocrline;
   OCRWord ocrword;

   int previous_spacing = 1000;
   // Not reset inside the loop: for the last character this still holds the
   // value computed for the character before it.
   int next_spacing = 1000;
   for (vector<OCRChar>::const_iterator it = ocrchars.begin();
        it != ocrchars.end(); ++it){

      const OCRChar& ocrchar = *it;

      // Gap between the previous character's right edge and this one.
      int current_spacing = 0;
      if (it > ocrchars.begin()){
         const OCRChar& previous_ocrchar = *(it-1);

         current_spacing = ocrchar.x - (previous_ocrchar.x + previous_ocrchar.width);
      }

      // Gap between this character's right edge and the next character.
      if (it < ocrchars.end() - 1){
         const OCRChar& next_ocrchar = *(it+1);
         next_spacing = next_ocrchar.x - (ocrchar.x + ocrchar.width);
      }

      // A clearly larger gap than its neighbours marks a word boundary.
      if (current_spacing > previous_spacing + 2 ||
          current_spacing > next_spacing + 2){
         ocrline.addWord(ocrword);
         ocrword.clear();
      }

      previous_spacing = current_spacing;

      ocrword.add(ocrchar);
   }

   if (!ocrword.empty())
      ocrline.addWord(ocrword);

   return ocrline;
}

/**
 * Recognizes the words inside one line blob and assembles them into an
 * OCRLine.
 *
 * @param screen_gray  image handed to getWordsFromImage()
 * @param lineblob     region of the image that contains the line
 */
OCRLine
recognize_line(const cv::Mat& screen_gray, const LineBlob& lineblob){

   // Template argument restored: getWordsFromImage() yields OCRWord objects.
   vector<OCRWord> words = getWordsFromImage(screen_gray, lineblob);

   OCRLine line;
   for(vector<OCRWord>::iterator it = words.begin(); it != words.end(); ++it)
      line.addWord(*it);
   return line;
}

/*
OCRLine
recognize_line(const cv::Mat& screen_gray, const LineBlob& lineblob){

   Blob b(lineblob);
   //Util::growRect(b, 2, 2, screen_gray);

   vector ocrchars = run_ocr(screen_gray, b);
   OCRLine ocrline = linkOCRCharsToOCRLine(ocrchars);
   return ocrline;
}
*/


/**
 * Recognizes every line blob of a paragraph blob and collects the lines
 * that produced at least one word.
 *
 * @param screen_gray  image handed to recognize_line()
 * @param parablob     paragraph region, iterated line blob by line blob
 */
OCRParagraph
recognize_paragraph(const cv::Mat& screen_gray, const ParagraphBlob& parablob){

   OCRParagraph ocrparagraph;

   // Template argument restored from the element type bound below.
   // NOTE(review): assumes ParagraphBlob::begin()/end() return
   // vector<LineBlob>::const_iterator; confirm against its declaration.
   for (vector<LineBlob>::const_iterator it = parablob.begin();
        it != parablob.end(); ++it){

      const LineBlob& lineblob = *it;
      OCRLine ocrline = recognize_line(screen_gray, lineblob);

      // Skip lines in which nothing was recognized.
      if (!ocrline.getWords().empty())
         ocrparagraph.addLine(ocrline);
   }

   return ocrparagraph;
}

/**
 * Full-page recognition: finds paragraph blobs in `screen`, recognizes each
 * of them on a grayscale version of the image and collects the results.
 *
 * @param screen  image to recognize; converted with CV_RGB2GRAY when it has
 *                more than one channel, used as is otherwise
 * @return the recognized paragraphs wrapped in an OCRText.
 */
OCRText
OCR::recognize(cv::Mat screen){

   OCRText ocrtext;

   // Template argument restored: elements are bound to ParagraphBlob& below.
   vector<ParagraphBlob> parablobs;
   cvgui::getParagraphBlobs(screen, parablobs);

   Mat screen_gray;
   if(screen.channels()>1)
      cvtColor(screen,screen_gray,CV_RGB2GRAY);
   else
      screen_gray = screen;

   for (vector<ParagraphBlob>::iterator it = parablobs.begin();
        it != parablobs.end(); ++it){

      ParagraphBlob& parablob = *it;

      OCRParagraph ocrpara;
      ocrpara = recognize_paragraph(screen_gray, parablob);
      ocrtext.addParagraph(ocrpara);

   }
   //TODO: VISUAL LOGGING
   //Mat dark = screen * 0.2;
   //Painter::drawOCRText(dark, ocrtext);
   //VLOG("OCR-result", dark);

   return ocrtext;
}



/**
 * Character-level recognition of a raw image buffer.
 *
 * The box text from getBoxText() is parsed as whitespace-separated records
 * "ch x0 y0 x1 y1 page". Each record becomes an OCRChar whose rectangle is
 * (x0, height - y1, x1 - x0, y1 - y0), i.e. the y coordinate is flipped so
 * that it is measured from the top of the image.
 *
 * @param imagedata  raw pixel buffer
 * @param width      image width in pixels
 * @param height     image height in pixels
 * @param bpp        bits per pixel of the buffer
 * @return the recognized characters; empty when no box text was produced.
 */
vector<OCRChar>
OCR::recognize(const unsigned char* imagedata,
               int width, int height, int bpp){

   OCR::init();

   // Template argument restored: OCRChar objects are pushed below.
   vector<OCRChar> ret;

   char* boxtext = getBoxText(imagedata,width,height,bpp);

   if (boxtext){

      stringstream str(boxtext);
      string ch;
      int x0,y0,x1,y1, page;
      while (str >> ch >> x0 >> y0 >> x1 >> y1 >> page){

         //convert back to the screen coordinate (0,0) - (left,top)
         int h = y1 - y0;
         int w = x1 - x0;
         OCRChar ocr_char(ch,x0,height-y1,w,h);

         ret.push_back(ocr_char);
      }

      delete [] boxtext;
   }

   return ret;
}



/**
 * Word-level recognition of a raw image buffer.
 *
 * Characters (with rectangles) come from the character-level recognize();
 * word boundaries come from the engine's UTF-8 text: the text is walked in
 * parallel with the character list and a space or newline closes the
 * current word. Each word then receives its confidence (0..1) from
 * AllWordConfidences().
 *
 * Changes from the earlier version:
 *  - the text and confidence buffers are released with delete[] (they were
 *    leaked);
 *  - a null text, or a text shorter than the character list, no longer
 *    leads to reads outside the buffer: remaining characters are appended
 *    to the current word;
 *  - a whitespace byte advances the text cursor by exactly one byte rather
 *    than by the byte length of the next character.
 *
 * @param imagedata  raw pixel buffer
 * @param width      image width in pixels
 * @param height     image height in pixels
 * @param bpp        bits per pixel of the buffer
 * @return the recognized words, in reading order.
 */
vector<OCRWord>
OCR::recognize_to_words(const unsigned char* imagedata,
                        int width, int height, int bpp){

   OCR::init();

   vector<OCRWord> ret;
   vector<OCRChar> chars = OCR::recognize(imagedata, width, height, bpp);
   char *text = _tessAPI.GetUTF8Text();
   int *scores = _tessAPI.AllWordConfidences();

   // Walk an empty string when the engine returned no text.
   const char *p_ch = text ? text : "";
   OCRWord word;

   for(vector<OCRChar>::iterator it = chars.begin(); it != chars.end(); ){
      const bool text_exhausted = (*p_ch == '\0');
      if(text_exhausted || (*p_ch != ' ' && *p_ch != '\n')){
         word.add(*it);
         // Skip the bytes of this character, never past the terminator.
         for(size_t left = it->ch.length(); left > 0 && *p_ch != '\0'; --left)
            ++p_ch;
         ++it;
      }
      else{
         // Whitespace in the text: close the current word.
         if(!word.empty()){
            ret.push_back(word);
            word.clear();
         }
         ++p_ch;
      }
   }
   if(!word.empty())
      ret.push_back(word);

   // Confidences are 0..100 and terminated by a negative value. If the
   // engine reports a different number of words than were assembled here,
   // only the common prefix receives a score.
   if(scores){
      for(size_t i = 0; i < ret.size() && scores[i] >= 0; ++i)
         ret[i].score = scores[i]/100.f;
      delete [] scores;
   }
   delete [] text;

   return ret;
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy