
// srcnativelibs.Vision.tessocr.cpp Maven / Gradle / Ivy
/*
* Copyright 2010-2014, Sikuli.org, sikulix.com
* Released under the MIT License.
*
*/
#include
#include
#include
#include
#include
#include
#include "tessocr.h"
#include "sikuli-debug.h"
using namespace std;
using namespace sikuli;
using namespace tesseract;
// Definition of the static Tesseract engine instance used by the OCR methods below.
TessBaseAPI OCR::_tessAPI;
// Number of bytes needed to store one row of `xsize` pixels at `bpp` bits per pixel.
// For bpp > 8 this is xsize*bpp bits rounded up to whole bytes; otherwise 8/bpp
// pixels are packed into each byte and the pixel count is rounded up accordingly.
#define COMPUTE_IMAGE_XDIM(xsize,bpp) ((bpp)>8 ? ((xsize)*(bpp)+7)/8 :((xsize)+8/(bpp)-1)/(8/(bpp)))
// Runs the shared Tesseract engine on a raw image buffer and returns the
// per-character "box text" it produces.
//
// imagedata     raw pixel buffer handed to Tesseract unchanged
// width/height  image size in pixels
// bpp           bits per pixel (bytes per pixel is derived as bpp / 8)
//
// Returns the buffer produced by TessBaseAPI::GetBoxText(0); OCR::recognize()
// in this file releases it with delete [].
char* OCR::getBoxText(const unsigned char* imagedata,
                      int width, int height, int bpp){
   const int pixel_bytes = bpp / 8;
   const int row_bytes = COMPUTE_IMAGE_XDIM(width, bpp);

   _tessAPI.SetImage(imagedata, width, height, pixel_bytes, row_bytes);
   _tessAPI.Recognize(0);
   return _tessAPI.GetBoxText(0);
}
// Runs the shared Tesseract engine on a raw image buffer and returns the
// recognized text as UTF-8.
//
// imagedata     raw pixel buffer handed to Tesseract unchanged
// width/height  image size in pixels
// bpp           bits per pixel (bytes per pixel is derived as bpp / 8)
//
// Returns the buffer produced by TessBaseAPI::GetUTF8Text();
// OCR::recognize_as_string() in this file releases it with delete [].
char* OCR::getText(const unsigned char* imagedata,
                   int width, int height, int bpp){
   const int pixel_bytes = bpp / 8;
   const int row_bytes = COMPUTE_IMAGE_XDIM(width, bpp);

   _tessAPI.SetImage(imagedata, width, height, pixel_bytes, row_bytes);
   _tessAPI.Recognize(0);
   return _tessAPI.GetUTF8Text();
}
// Builds a rectangle from its left/top corner and its size.
OCRRect::OCRRect(int x_, int y_, int width_, int height_)
   : x(x_),
     y(y_),
     width(width_),
     height(height_)
{
}

// Builds an "unset" rectangle: every field is -1. addOCRRect() treats a
// rectangle with negative width and height as empty and simply adopts the
// first rectangle added to it.
OCRRect::OCRRect()
   : x(-1),
     y(-1),
     width(-1),
     height(-1)
{
}
// Grows this rectangle so that it also covers `rect`.
// If this rectangle is still unset (negative width and height) it becomes a
// copy of `rect`; otherwise it becomes the bounding box of both rectangles.
void
OCRRect::addOCRRect(const OCRRect& rect){
   const bool is_unset = (width < 0 && height < 0);
   if (is_unset){
      x = rect.x;
      y = rect.y;
      width = rect.width;
      height = rect.height;
      return;
   }

   // Capture all four far edges before any field is modified.
   const int own_right = x + width;
   const int own_bottom = y + height;
   const int other_right = rect.x + rect.width;
   const int other_bottom = rect.y + rect.height;

   if (rect.x < x)
      x = rect.x;
   if (rect.y < y)
      y = rect.y;

   const int right = own_right > other_right ? own_right : other_right;
   const int bottom = own_bottom > other_bottom ? own_bottom : other_bottom;
   width = right - x;
   height = bottom - y;
}
void
OCRWord::add(const OCRChar& ocr_char){
addOCRRect(ocr_char);
ocr_chars_.push_back(ocr_char);
}
// Returns the word's text: the `ch` strings of all stored characters,
// concatenated in insertion order. Empty when the word has no characters.
string
OCRWord::str(){
   string text;
   for (vector<OCRChar>::iterator it = ocr_chars_.begin();
        it != ocr_chars_.end(); ++it){
      // Append in place instead of rebuilding the whole string each time.
      text += it->ch;
   }
   return text;
}
// Returns a copy of the characters that make up this word, in insertion order.
vector<OCRChar>
OCRWord::getChars(){
   return ocr_chars_;
}
// Returns the word's text; simply forwards to str().
string
OCRWord::getString(){
   return this->str();
}
// Empties the word: drops all characters and marks the bounding box as unset
// by making width and height negative (x and y are left untouched;
// addOCRRect() only checks width and height).
void
OCRWord::clear() {
   ocr_chars_.clear();
   height = -1;
   width = -1;
}
bool
OCRWord::isValidWord(){
return OCR::_tessAPI.IsValidWord(str().c_str());
}
// Appends a word to the line and extends the line's bounding box so that it
// covers the word.
void
OCRLine::addWord(OCRWord& ocr_word){
   // The bounding box is updated first, then the word is stored.
   this->addOCRRect(ocr_word);
   this->ocr_words_.push_back(ocr_word);
}
// Returns a copy of the words on this line, in insertion order.
vector<OCRWord>
OCRLine::getWords(){
   return ocr_words_;
}
// Returns the text of the line: the words' strings joined by single spaces.
// An empty line yields an empty string.
string
OCRLine::getString(){
   string ret;
   for (vector<OCRWord>::iterator it = ocr_words_.begin();
        it != ocr_words_.end(); ++it){
      // Separator goes between words only, never before the first one.
      if (it != ocr_words_.begin())
         ret += " ";
      ret += it->getString();
   }
   return ret;
}
// Appends a line to the paragraph and extends the paragraph's bounding box
// so that it covers the line.
void
OCRParagraph::addLine(OCRLine& ocr_line){
   // The bounding box is updated first, then the line is stored.
   this->addOCRRect(ocr_line);
   this->ocr_lines_.push_back(ocr_line);
}
// Returns a copy of the lines in this paragraph, in insertion order.
// Because a fresh copy is returned on every call, callers must store the
// result before iterating over it.
vector<OCRLine>
OCRParagraph::getLines(){
   return ocr_lines_;
}
//void
//OCRText::add(OCRWord& ocr_word){
// ocr_words_.push_back(ocr_word);
//}
//
//void
//OCRText::addLine(OCRLine& ocr_line){
// ocr_lines_.push_back(ocr_line);
//}
// Placeholder for writing the recognized text to `filename`.
// NOTE: the body is currently empty, so calling this does nothing; the
// previous implementation is kept below, commented out, as a starting point.
void
OCRText::save(const char* filename){
// TODO: reimplement
//
// ofstream of(filename);
//
// for (iterator it = begin();
// it != end(); ++it){
//
// of << it->str() << " ";
// }
//
// of.close();
}
void
OCRText::save_with_location(const char* filename){
vector words = getWords();
ofstream of(filename);
for (vector::iterator it = words.begin();
it != words.end(); ++it){
OCRWord& w = *it;
of << w.x << " " << w.y << " " << w.width << " " << w.height << " ";
of << w.getString() << " ";
of << endl;
}
of.close();
}
// Appends a paragraph to the text and extends the text's bounding box so
// that it covers the paragraph.
void
OCRText::addParagraph(OCRParagraph& ocr_paragraph){
   // The bounding box is updated first, then the paragraph is stored.
   this->addOCRRect(ocr_paragraph);
   this->ocr_paragraphs_.push_back(ocr_paragraph);
}
// Returns the text of every line of every paragraph, in document order.
vector<string>
OCRText::getLineStrings(){
   vector<string> line_strings;
   for (vector<OCRParagraph>::iterator it = ocr_paragraphs_.begin();
        it != ocr_paragraphs_.end(); ++it){
      // getLines() returns a copy, so keep it in a local: taking begin() and
      // end() from two separate calls would compare iterators that belong to
      // different (already destroyed) temporaries.
      vector<OCRLine> lines = it->getLines();
      for (vector<OCRLine>::iterator it1 = lines.begin();
           it1 != lines.end(); ++it1){
         line_strings.push_back(it1->getString());
      }
   }
   return line_strings;
}
// Returns copies of all words of all lines of all paragraphs, flattened into
// one vector in document order.
vector<OCRWord>
OCRText::getWords(){
   vector<OCRWord> ret_words;
   for (vector<OCRParagraph>::iterator it = ocr_paragraphs_.begin();
        it != ocr_paragraphs_.end(); ++it){
      vector<OCRLine> lines = it->getLines();
      for (vector<OCRLine>::iterator it1 = lines.begin();
           it1 != lines.end(); ++it1){
         vector<OCRWord> words = it1->getWords();
         for (vector<OCRWord>::iterator it2 = words.begin();
              it2 != words.end(); ++it2){
            ret_words.push_back(*it2);
         }
      }
   }
   return ret_words;
}
// Returns a copy of the paragraphs of this text, in insertion order.
vector<OCRParagraph>
OCRText::getParagraphs(){
   return ocr_paragraphs_;
}
// Returns the text of every word in document order. After the words of each
// line a "\n" entry is appended to mark the line break.
vector<string>
OCRText::getWordStrings(){
   vector<string> word_strings;
   for (vector<OCRParagraph>::iterator it = ocr_paragraphs_.begin();
        it != ocr_paragraphs_.end(); ++it){
      vector<OCRLine> lines = it->getLines();
      for (vector<OCRLine>::iterator it1 = lines.begin();
           it1 != lines.end(); ++it1){
         vector<OCRWord> words = it1->getWords();
         for (vector<OCRWord>::iterator it2 = words.begin();
              it2 != words.end(); ++it2){
            word_strings.push_back(it2->getString());
         }
         // add new line
         word_strings.push_back("\n");
      }
   }
   return word_strings;
}
// Returns the whole recognized text as one string: the entries of
// getWordStrings() (words plus "\n" line markers), each followed by a single
// space. An empty text yields an empty string; a single entry is returned
// as is.
string
OCRText::getString(){
   vector<string> word_strings = getWordStrings();
   if (word_strings.empty())
      return "";
   string ret = word_strings.front();
   // Separate the first entry from the second one as well; previously the
   // first two entries were glued together without a space.
   if (word_strings.size() > 1)
      ret += " ";
   for (vector<string>::iterator it = word_strings.begin() + 1;
        it != word_strings.end(); ++it){
      ret += *it;
      ret += " ";
   }
   return ret;
}
// Maps a character to a small numeric code:
//   '0'..'9' -> 2..11
//   'a'..'z' -> 12..37
//   'A'..'Z' -> 12..37 (same codes as lower case)
//   anything else -> 0
char
encode(char ch){
   if ('0' <= ch && ch <= '9')
      return ch - '0' + 2;
   if ('a' <= ch && ch <= 'z')
      return ch - 'a' + 12;
   if ('A' <= ch && ch <= 'Z')
      return ch - 'A' + 12;
   return 0;
}
// produce a new image 200% the size of the given image
// (the output buffer is allocated with room for width*height*4 bytes)
unsigned char* x2(const unsigned char* imagedata,
int width, int height, int bpp){
int bytes_per_pixel = bpp / 8;
unsigned char* newimage = new unsigned char[width*height*4];
const unsigned char* p = imagedata;
unsigned char* q = newimage;
// NOTE(review): the next line is truncated in this copy of the file. The rest
// of x2() and the start of the routine that owns the TESSDATA_PREFIX setup
// below appear to be missing (everything between "k" and "(env_datapath"
// was lost). TODO: restore from the original source before building.
for (int k=0;k(env_datapath.c_str()));
#else
//putenv on Mac breaks the "open" command somehow.
//we have to use setenv instead.
setenv("TESSDATA_PREFIX", datapath, 1);
#endif
// Initialize the shared Tesseract engine with the data path and language.
// NOTE(review): the result stored in `ret` is not checked here, and
// isInitialized is set to true regardless — confirm this is intended.
int ret = _tessAPI.Init(datapath, _lang.c_str());
//TODO
//int ret = _tessAPI.Init(datapath, _lang.c_str(), OEM_TESSERACT_ONLY);
// _tessAPI.SetAccuracyVSpeed(AVS_MOST_ACCURATE); // FIXME: doesn't work?
isInitialized = true;
}
#include "cvgui.h"
using namespace cv;
#define MAXLEN 80
// Returns the smallest of the three given integers.
static int findMin(int d1, int d2, int d3) {
   int smallest = d3;
   if (d2 < smallest)
      smallest = d2;
   if (d1 < smallest)
      smallest = d1;
   return smallest;
}
// Budgeted edit distance between two C strings.
//
// Works like the plain recursive edit distance (substitute / insert / delete,
// each costing 1), but every edit also consumes one unit of the budget `k`.
// Once the budget reaches 0 the remaining suffixes are counted as cost 0, so
// the recursion depth in edits is bounded by `k`. If either string is
// exhausted, the length of the other string's remainder is returned
// regardless of the budget.
static int
findEditDistanceLessThanK(const char *s1, const char *s2,
                          int k){
   if (*s1 == '\0')
      return strlen(s2);
   if (*s2 == '\0')
      return strlen(s1);
   if (k == 0)
      return 0;

   // A matching head costs nothing and leaves the budget untouched.
   const int head_cost = (*s1 == *s2) ? 0 : 1;
   const int via_update = head_cost + findEditDistanceLessThanK(s1 + 1, s2 + 1, k - head_cost);
   const int via_insert = 1 + findEditDistanceLessThanK(s1, s2 + 1, k - 1);
   const int via_delete = 1 + findEditDistanceLessThanK(s1 + 1, s2, k - 1);
   return findMin(via_update, via_insert, via_delete);
}
// Returns the edit (Levenshtein) distance between s1 and s2: the minimum
// number of single-character substitutions, insertions and deletions needed
// to turn one string into the other.
//
// Computed with the usual two-row dynamic programme, O(len1 * len2) time and
// O(len2) memory, instead of the exponential three-way recursion.
static int findEditDistance(const char *s1, const char *s2) {
   const size_t len1 = strlen(s1);
   const size_t len2 = strlen(s2);

   // prev[j] = distance between the first i-1 chars of s1 and first j of s2.
   std::vector<int> prev(len2 + 1);
   std::vector<int> cur(len2 + 1);
   for (size_t j = 0; j <= len2; ++j)
      prev[j] = static_cast<int>(j);

   for (size_t i = 1; i <= len1; ++i){
      cur[0] = static_cast<int>(i);
      for (size_t j = 1; j <= len2; ++j){
         int best = prev[j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1); // update.
         if (cur[j - 1] + 1 < best)
            best = cur[j - 1] + 1;                                  // insert.
         if (prev[j] + 1 < best)
            best = prev[j] + 1;                                     // delete.
         cur[j] = best;
      }
      prev.swap(cur);
   }
   return prev[len2];
}
// Sharpens `img` in place: a Gaussian-blurred copy (sigma 5) is subtracted
// from the image using the weights 2.5 * img - 1.5 * blurred.
void sharpen(Mat& img){
   const double sigma = 5;
   const double image_weight = 2.5;
   const double blurred_weight = -1.5;

   Mat blurred;
   GaussianBlur(img, blurred, cv::Size(0, 0), sigma);
   addWeighted(img, image_weight, blurred, blurred_weight, 0, img);
}
float preprocess_for_ocr(const Mat& in_img, Mat& out_img){
const float MIN_HEIGHT = 30;
float scale = 1.f;
if (in_img.rows < MIN_HEIGHT){
scale = MIN_HEIGHT / float(in_img.rows);
resize(in_img, out_img, Size(in_img.cols*scale,in_img.rows*scale));
//TODO
//resize(in_img, out_img, Size(in_img.cols*scale,in_img.rows*scale), 0, 0, INTER_CUBIC);
//copyMakeBorder (in_img, out_img, 0, (scale-1)*in_img.rows, 0, (scale-1)*in_img.cols, BORDER_REPLICATE);
}else {
out_img = in_img;
}
sharpen(out_img);
//imshow("ocrImage", out_img);
return scale;
}
// Recognizes the text in `blobImage` and returns it as a string.
// The image is converted to grayscale with CV_RGB2GRAY, preprocessed for OCR
// and handed to Tesseract as an 8-bit image. Returns "" when Tesseract
// yields no text buffer.
string OCR::recognize_as_string(const Mat& blobImage){
   OCR::init();

   Mat gray;
   cvtColor(blobImage, gray, CV_RGB2GRAY);

   Mat ocrImage; // the image passed to tesseract
   preprocess_for_ocr(gray, ocrImage);

   char* text = getText(ocrImage.data, ocrImage.cols, ocrImage.rows, 8);
   if (!text)
      return "";

   // Copy the result, then release the buffer returned by getText().
   string ret(text);
   delete [] text;
   return ret;
}
// Recognizes the words inside the region `blob` of `screen`.
// The region is preprocessed (possibly enlarged) before OCR; the returned
// word rectangles are scaled back and shifted by the blob's origin so that
// they are expressed in `screen` coordinates.
vector<OCRWord> getWordsFromImage(const Mat& screen, const Blob& blob){
   Mat blobImage(screen,blob);
   Mat ocrImage; // the image passed to tesseract
   float scale = preprocess_for_ocr(blobImage, ocrImage);

   vector<OCRWord> ocr_words = OCR::recognize_to_words(ocrImage.data,
                                                       ocrImage.cols,
                                                       ocrImage.rows,
                                                       8);
   for (vector<OCRWord>::iterator iter = ocr_words.begin();
        iter != ocr_words.end(); ++iter){
      OCRWord& word = *iter;
      if(scale>1.f){
         // scale back the coordinates in the OCR result
         word.x = word.x/scale;
         word.y = word.y/scale;
         word.width = word.width/scale;
         word.height = word.height/scale;
      }
      word.x += blob.x;
      word.y += blob.y;
   }
   return ocr_words;
}
// Recognizes the individual characters inside the region `blob` of `screen`.
// The region is preprocessed (possibly enlarged) before OCR; the returned
// character rectangles are scaled back and shifted by the blob's origin so
// that they are expressed in `screen` coordinates.
vector<OCRChar> run_ocr(const Mat& screen, const Blob& blob){
   Mat blobImage(screen,blob);
   Mat ocrImage; // the image passed to tesseract
   float scale = preprocess_for_ocr(blobImage, ocrImage);

   vector<OCRChar> ocr_chars = OCR::recognize(ocrImage.data,
                                              ocrImage.cols,
                                              ocrImage.rows,
                                              8);
   for (vector<OCRChar>::iterator iter = ocr_chars.begin();
        iter != ocr_chars.end(); ++iter){
      OCRChar& ocrchar = *iter;
      if(scale>1.f){
         // scale back the coordinates in the OCR result
         ocrchar.x = ocrchar.x/scale;
         ocrchar.y = ocrchar.y/scale;
         ocrchar.width = ocrchar.width/scale;
         ocrchar.height = ocrchar.height/scale;
      }
      ocrchar.x += blob.x;
      ocrchar.y += blob.y;
   }
   return ocr_chars;
}
void
find_phrase_helper(const Mat& screen_gray, vector words, vector lineblobs,
LineBlob resultblob, vector& results, bool is_find_one = true){
string word = words[0];
vector rest;
for (vector::iterator it2 = words.begin()+1;
it2 != words.end(); ++ it2)
rest.push_back(*it2);
dhead("find_phrase") << "<" << word << ">" << endl;
vector lineblobs_thisround = lineblobs;
for (int r = 0; r < 3; ++r){
for (int tolerance = 0; tolerance < 3; ++tolerance){
vector lineblobs_nextround;
for (vector::iterator it = lineblobs_thisround.begin();
it != lineblobs_thisround.end(); ++it){
LineBlob lineblob = *it;
if (abs((int)lineblob.blobs.size() - (int)word.size()) > tolerance){
lineblobs_nextround.push_back(lineblob);
continue;
}
dhead("find_phrase") << lineblob.x << "," << lineblob.y << "," << lineblob.width << "," << lineblob.height << endl;
vector ocr_chars = run_ocr(screen_gray, lineblob);
dhead("find_phrase") << word << "<->";
string ocrword = "";
for (vector::iterator iter = ocr_chars.begin();
iter != ocr_chars.end(); iter++){
OCRChar& ocrchar = *iter;
dout("find_phrase") << ocrchar.ch;
ocrword = ocrword + ocrchar.ch;
}
if (ocr_chars.size() < 1){
dout("find_phrase") << endl;
continue;
}
int d = findEditDistanceLessThanK(word.c_str(), ocrword.c_str(),3);
dout("find_phrase") << '[' << d << ']';
if (d > 2){
dout("find_phrase") << endl;
lineblobs_nextround.push_back(lineblob);
continue;
}
if (rest.empty()){
dout("find_phrase") << " ... match!" << endl;
//Blob b = resultblob;
//dout("find_phrase") << b.x << "," << b.y << endl;
//b = lineblob;
//dout("find_phrase") << b.x << "," << b.y << endl;
resultblob.merge(lineblob);
FindResult result(resultblob.x,resultblob.y,
resultblob.width,resultblob.height, 1.0);
results.push_back(result);
return;
}
else
dout("find_phrase") << endl;
vector nextblobs;
for (vector::iterator it2 = lineblobs.begin();
it2 != lineblobs.end(); ++it2){
LineBlob& b1 = lineblob;
LineBlob& b2 = *it2;
bool similar_baseline = abs((b1.y + b1.height) - (b2.y + b2.height)) < 5;
bool close_right = (b2.x > b1.x) && (b2.x - (b1.x+b1.width)) < 20;
bool close_below = (b2.y > b1.y) && (b2.y - b1.y) < 20;
if (close_right && similar_baseline)
nextblobs.push_back(b2);
}
if (!rest.empty() && !nextblobs.empty()){
LineBlob next_resultblob = resultblob;
next_resultblob.merge(lineblob);
find_phrase_helper(screen_gray, rest, nextblobs, next_resultblob, results, is_find_one);
}
dout("find_phrase") << endl;
// check if we have already found one match
if (is_find_one && results.size() >= 1)
// if so, we return the reuslts right away
return;
}
lineblobs_thisround = lineblobs_nextround;
}
}
}
// Public entry point for the budgeted edit distance: forwards to the
// file-local findEditDistanceLessThanK() with the same arguments.
int
OCR::findEditDistance(const char *s1, const char *s2,
                      int k){
   const int distance = findEditDistanceLessThanK(s1, s2, k);
   return distance;
}
// Searches `screen` for the given sequence of words and returns the
// rectangles of the matches found.
//
// screen       screen image; converted to grayscale with CV_RGB2GRAY
// words        the phrase, one entry per word
// is_find_one  stop after the first match
//
// An empty phrase yields no results.
vector<FindResult>
OCR::find_phrase(const Mat& screen, vector<string> words, bool is_find_one){
   vector<FindResult> results;
   if (words.empty())
      return results;

   vector<LineBlob> lineblobs;
   cvgui::getLineBlobsAsIndividualWords(screen, lineblobs);

   Mat screen_gray;
   cvtColor(screen,screen_gray,CV_RGB2GRAY);

   LineBlob empty;
   find_phrase_helper(screen_gray, words, lineblobs, empty, results, is_find_one);
   return results;
}
// Searches `screenshot` for a single word; equivalent to find_phrase() with
// a one-word phrase.
vector<FindResult>
OCR::find_word(const Mat& screenshot, string word, bool is_find_one){
   vector<string> words;
   words.push_back(word);
   return find_phrase(screenshot, words, is_find_one);
}
// Loads the image file `screenshot_filename` (imread flag 1) and runs full
// text recognition on it.
OCRText
OCR::recognize_screenshot(const char* screenshot_filename){
   const int imread_flag = 1;
   Mat loaded = imread(screenshot_filename, imread_flag);
   return recognize(loaded);
}
// Groups a left-to-right sequence of recognized characters into words and
// returns them as one line.
//
// A word break is placed before a character when the gap to the previous
// character is more than 2 pixels larger than either the previous gap or the
// gap to the next character.
OCRLine
linkOCRCharsToOCRLine(const vector<OCRChar>& ocrchars){
   OCRLine ocrline;
   OCRWord ocrword;
   int previous_spacing = 1000;
   int next_spacing = 1000;
   for (vector<OCRChar>::const_iterator it = ocrchars.begin();
        it != ocrchars.end(); ++it){
      const OCRChar& ocrchar = *it;

      // Gap between the previous character and this one (0 for the first).
      int current_spacing = 0;
      if (it > ocrchars.begin()){
         const OCRChar& previous_ocrchar = *(it-1);
         current_spacing = ocrchar.x - (previous_ocrchar.x + previous_ocrchar.width);
      }
      // Gap between this character and the next one; for the last character
      // the value from the previous iteration is kept.
      if (it < ocrchars.end() - 1){
         const OCRChar& next_ocrchar = *(it+1);
         next_spacing = next_ocrchar.x - (ocrchar.x + ocrchar.width);
      }

      const bool word_break = current_spacing > previous_spacing + 2 ||
                              current_spacing > next_spacing + 2;
      // Never flush an empty word (possible at the first character when the
      // next gap is negative): its unset rectangle would distort the line's
      // bounding box.
      if (word_break && !ocrword.empty()){
         ocrline.addWord(ocrword);
         ocrword.clear();
      }
      previous_spacing = current_spacing;
      ocrword.add(ocrchar);
   }
   if (!ocrword.empty())
      ocrline.addWord(ocrword);
   return ocrline;
}
// Recognizes the words inside one line blob of `screen_gray` and returns
// them as an OCRLine (empty if no words were recognized).
OCRLine
recognize_line(const cv::Mat& screen_gray, const LineBlob& lineblob){
   vector<OCRWord> words = getWordsFromImage(screen_gray, lineblob);
   OCRLine line;
   for (vector<OCRWord>::iterator it = words.begin(); it != words.end(); ++it)
      line.addWord(*it);
   return line;
}
/*
OCRLine
recognize_line(const cv::Mat& screen_gray, const LineBlob& lineblob){
Blob b(lineblob);
//Util::growRect(b, 2, 2, screen_gray);
vector ocrchars = run_ocr(screen_gray, b);
OCRLine ocrline = linkOCRCharsToOCRLine(ocrchars);
return ocrline;
}
*/
// Recognizes every line blob of a paragraph blob and returns the result as
// an OCRParagraph. Lines in which no words were recognized are left out.
OCRParagraph
recognize_paragraph(const cv::Mat& screen_gray, const ParagraphBlob& parablob){
   OCRParagraph ocrparagraph;
   for (vector<LineBlob>::const_iterator it = parablob.begin();
        it != parablob.end(); ++it){
      const LineBlob& lineblob = *it;
      OCRLine ocrline = recognize_line(screen_gray, lineblob);
      if (!ocrline.getWords().empty())
         ocrparagraph.addLine(ocrline);
   }
   return ocrparagraph;
}
// Runs full text recognition on a screen image.
// The image is split into paragraph blobs by cvgui::getParagraphBlobs();
// each blob is recognized on the grayscale version of the screen and added
// to the returned OCRText. Images that already have a single channel are
// used as they are.
OCRText
OCR::recognize(cv::Mat screen){
   OCRText ocrtext;
   vector<ParagraphBlob> parablobs;
   cvgui::getParagraphBlobs(screen, parablobs);

   Mat screen_gray;
   if(screen.channels()>1)
      cvtColor(screen,screen_gray,CV_RGB2GRAY);
   else
      screen_gray = screen;

   for (vector<ParagraphBlob>::iterator it = parablobs.begin();
        it != parablobs.end(); ++it){
      OCRParagraph ocrpara = recognize_paragraph(screen_gray, *it);
      ocrtext.addParagraph(ocrpara);
   }
   //TODO: VISUAL LOGGING
   //Mat dark = screen * 0.2;
   //Painter::drawOCRText(dark, ocrtext);
   //VLOG("OCR-result", dark);
   return ocrtext;
}
// Recognizes the individual characters in a raw image buffer.
//
// The box text returned by getBoxText() is parsed as a sequence of records
//    <char> <x0> <y0> <x1> <y1> <page>
// Each record becomes an OCRChar whose y coordinate is flipped
// (height - y1) so that the origin is the top-left corner of the image.
// Returns an empty vector when no box text is available.
vector<OCRChar>
OCR::recognize(const unsigned char* imagedata,
               int width, int height, int bpp){
   OCR::init();
   vector<OCRChar> ret;
   char* boxtext = getBoxText(imagedata,width,height,bpp);
   if (boxtext){
      stringstream str(boxtext);
      string ch;
      int x0,y0,x1,y1, page;
      while (str >> ch >> x0 >> y0 >> x1 >> y1 >> page){
         //convert back to the screen coordinate (0,0) - (left,top)
         int h = y1 - y0;
         int w = x1 - x0;
         OCRChar ocr_char(ch,x0,height-y1,w,h);
         ret.push_back(ocr_char);
      }
      delete [] boxtext;
   }
   return ret;
}
// Recognizes the words in a raw image buffer.
//
// The per-character boxes from OCR::recognize() are walked in parallel with
// Tesseract's UTF-8 text: a space or newline in the text closes the current
// word, any other position consumes the next character box. Each word then
// receives Tesseract's word confidence (0..100) scaled to 0..1 in `score`,
// for as many words as confidences are available.
vector<OCRWord>
OCR::recognize_to_words(const unsigned char* imagedata,
                        int width, int height, int bpp){
   OCR::init();
   vector<OCRWord> ret;
   vector<OCRChar> chars = OCR::recognize(imagedata, width, height, bpp);

   // Both buffers are heap arrays handed over by Tesseract; they are
   // released with delete [] at the end of this function.
   char *text = _tessAPI.GetUTF8Text();
   int *scores = _tessAPI.AllWordConfidences();

   const size_t text_len = text ? strlen(text) : 0;
   size_t pos = 0; // byte offset into text
   OCRWord word;
   for(vector<OCRChar>::iterator it = chars.begin(); it != chars.end(); ){
      // Past the end of the text nothing is a separator: remaining boxes are
      // simply appended to the current word instead of reading out of bounds.
      const bool at_separator =
         pos < text_len && (text[pos] == ' ' || text[pos] == '\n');
      if(!at_separator){
         word.add(*it);
         pos += it->ch.length(); // a character may span several UTF-8 bytes
         ++it;
      }
      else{
         if(!word.empty()){
            ret.push_back(word);
            word.clear();
         }
         pos += 1; // separators are a single byte
      }
   }
   if(!word.empty())
      ret.push_back(word);

   if(scores){
      // The confidence list is terminated by a negative value.
      for(size_t i = 0; i < ret.size() && scores[i] >= 0; ++i){
         ret[i].score = scores[i]/100.f;
      }
   }

   delete [] scores;
   delete [] text;
   return ret;
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy