// org.sikuli.script.OCR (sikulixapi) - Maven / Gradle / Ivy
// (page residue: "Show all versions of sikulixapi", "Show documentation")
/*
* Copyright (c) 2010-2020, sikuli.org, sikulix.com - MIT license
*/
package org.sikuli.script;
import org.sikuli.basics.Debug;
import org.sikuli.basics.Settings;
import java.awt.Font;
import java.awt.FontMetrics;
import java.awt.Graphics;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.*;
/**
* Static helper class for OCR via Tess4J/Tesseract.
*
* The methods in this class are not thread-safe.
* @see "SikuliX docs: Text and OCR"
*/
public class OCR {
//
/**
 * OCR Engine modes.
 *
 * The position (ordinal) of each constant is the int value that
 * {@link Options#oem(OEM)} stores, so the order of the constants must not change.
 *
 * <pre>
 * 0 TESSERACT_ONLY           Tesseract Legacy only.
 * 1 LSTM_ONLY                LSTM only.
 * 2 TESSERACT_LSTM_COMBINED  LSTM + Legacy.
 * 3 DEFAULT                  Default, based on what is available. (DEFAULT)
 * </pre>
 */
public enum OEM {
TESSERACT_ONLY, // 0
LSTM_ONLY, // 1
TESSERACT_LSTM_COMBINED, // 2
DEFAULT // 3
}
/**
 * Page segmentation modes.
 *
 * The position (ordinal) of each constant is the int value that
 * {@link Options#psm(PSM)} stores, so the order of the constants must not change.
 *
 * <pre>
 * 0  OSD_ONLY                Orientation and script detection (OSD) only.
 * 1  AUTO_OSD                Automatic page segmentation with OSD.
 * 2  AUTO_ONLY               Automatic page segmentation, but no OSD, or OCR.
 * 3  AUTO                    Fully automatic page segmentation, but no OSD. (Default)
 * 4  SINGLE_COLUMN           Assume a single column of text of variable sizes.
 * 5  SINGLE_BLOCK_VERT_TEXT  Assume a single uniform block of vertically aligned text.
 * 6  SINGLE_BLOCK            Assume a single uniform block of text.
 * 7  SINGLE_LINE             Treat the image as a single text line.
 * 8  SINGLE_WORD             Treat the image as a single word.
 * 9  CIRCLE_WORD             Treat the image as a single word in a circle.
 * 10 SINGLE_CHAR             Treat the image as a single character.
 * 11 SPARSE_TEXT             Sparse text. Find as much text as possible in no particular order.
 * 12 SPARSE_TEXT_OSD         Sparse text with OSD.
 * 13 RAW_LINE                Raw line. Treat the image as a single text line,
 *                            bypassing hacks that are Tesseract-specific.
 * </pre>
 */
public enum PSM {
OSD_ONLY, // 0
AUTO_OSD, // 1
AUTO_ONLY, // 2
AUTO, // 3
SINGLE_COLUMN, // 4
SINGLE_BLOCK_VERT_TEXT, // 5
SINGLE_BLOCK, // 6
SINGLE_LINE, // 7
SINGLE_WORD, // 8
CIRCLE_WORD, // 9
SINGLE_CHAR, // 10
SPARSE_TEXT, // 11
SPARSE_TEXT_OSD, // 12
RAW_LINE // 13
}
/**
 * INTERNAL: Tesseract option.
 *
 * NOTE(review): presumably the Tesseract page iterator level for single words - confirm
 * against the Tess4J/Tesseract constants.
 */
protected static final int PAGE_ITERATOR_LEVEL_WORD = 3;
/**
 * INTERNAL: Tesseract option.
 *
 * NOTE(review): presumably the Tesseract page iterator level for text lines - confirm
 * against the Tess4J/Tesseract constants.
 */
protected static final int PAGE_ITERATOR_LEVEL_LINE = 2;
//
//
private static Options options = new Options();
/**
 * Gives access to the one global Options instance shared by all callers.
 *
 * The returned object is the live instance, not a copy: changing it changes the
 * options used by every OCR method that relies on the global options.
 *
 * @return the global Options
 */
public static Options globalOptions() {
  return OCR.options;
}
/**
* A container for the options relevant for using {@link OCR} on
* {@link Region}s or {@link Image}s.
* Use {@code new OCR.Options()} to get a new option set;
* use {@link OCR#globalOptions()} to access the global options.
*
* In case you have to consult the Tesseract docs:
* @see "Tesseract docs"
*/
public static class Options implements Cloneable {
//
/**
 * Creates a new Options set initialized with the default settings.
 *
 * The defaults are exactly those established by {@link #reset()}.
 */
public Options() {
  this.reset();
}
/**
 * Creates an independent copy of this Options.
 *
 * The variables and configs collections are copied into new collections, so later
 * changes to the copy's variables/configs do not affect this Options and vice versa.
 *
 * @return new Options as copy
 */
@Override
public Options clone() {
  Options copy = new Options();
  copy.dataPath = this.dataPath;
  copy.language = this.language;
  copy.oem = this.oem;
  copy.psm = this.psm;
  copy.textHeight = this.textHeight;
  copy.resizeInterpolation = this.resizeInterpolation;
  copy.bestDPI = this.bestDPI;
  copy.userDPI = this.userDPI;
  // fresh collections: the defaults created by new Options() are replaced
  copy.configs = new LinkedHashSet<>(this.configs);
  copy.variables = new LinkedHashMap<>(this.variables);
  return copy;
}
/**
 * Resets this Options set to the initial defaults.
 *
 * <pre>
 * oem = OEM.DEFAULT.ordinal();
 * psm = PSM.AUTO.ordinal();
 * language = Settings.OcrLanguage;
 * dataPath = null; //(see comment)
 * textHeight = getDefaultTextHeight();
 * resizeInterpolation = Image.Interpolation.LINEAR;
 * variables.clear();
 * configs.clear();
 * bestDPI = null;
 * userDPI(TESSERACT_USER_DEFINED_DPI); // stores the variable user_defined_dpi
 * </pre>
 *
 * comment on dataPath==null: dataPath will be evaluated at the next use of an OCR feature
 * to the SikuliX default or Settings.OcrDataPath (if set)
 *
 * @return this
 */
public Options reset() {
  // engine and language selection
  language = Settings.OcrLanguage;
  dataPath = null;
  oem = OEM.DEFAULT.ordinal();
  psm = PSM.AUTO.ordinal();
  // image optimization
  textHeight = getDefaultTextHeight();
  resizeInterpolation = Image.Interpolation.LINEAR;
  bestDPI = null;
  // must be cleared before userDPI(), which stores a variable again
  configs.clear();
  variables.clear();
  userDPI(TESSERACT_USER_DEFINED_DPI);
  return this;
}
/**
 * Current state of this Options as some formatted lines of text.
 *
 * <pre>
 * OCR.Options:
 * data = ...some-path.../tessdata
 * language(eng) oem(3) psm(3) height(15,1) factor(1,99) dpi(96)
 * configs: conf1, conf2, ...
 * variables: key:value, ...
 * </pre>
 *
 * The dpi value shown is the screen resolution reported by the default Toolkit.
 * The configs/variables part is only appended if at least one of them is set.
 *
 * @return a text string as before
 */
@Override
public String toString() {
  StringBuilder text = new StringBuilder(String.format(
      "OCR.Options:" +
          "\ndata = %s" +
          "\nlanguage(%s) oem(%d) psm(%d) height(%.1f) factor(%.2f) dpi(%d)",
      dataPath(), language(), oem(), psm(),
      textHeight(), factor(),
      Toolkit.getDefaultToolkit().getScreenResolution()));
  if (hasVariablesOrConfigs()) {
    text.append("\n").append(logVariablesConfigs());
  }
  return text.toString();
}
/**
 * INTERNAL: validates this Options before OCR usage.
 *
 * Checks that the file {@code <language>.traineddata} exists in the current data path.
 *
 * @throws SikuliXception if the traineddata file for the current language is missing
 */
protected void validate() {
  File trainedData = new File(dataPath(), language() + ".traineddata");
  if (trainedData.exists()) {
    return;
  }
  throw new SikuliXception(String.format("OCR: language: no %s.traineddata in %s",
      language(), dataPath()));
}
//
//
private int oem;
/**
 * Returns the OCR engine mode currently stored in this Options.
 *
 * @return oem as int
 * @see OEM
 */
public int oem() {
  return this.oem;
}
/**
 * Sets the OCR engine mode of this Options.
 *
 * @param oem as int (0 .. 3)
 * @return this Options
 * @throws IllegalArgumentException if oem is not in the range 0 .. 3
 * @see OEM
 */
public Options oem(int oem) {
  boolean valid = oem >= 0 && oem <= 3;
  if (!valid) {
    throw new IllegalArgumentException(String.format("OCR: Invalid OEM %s (0 .. 3)", oem));
  }
  this.oem = oem;
  return this;
}
/**
 * Sets the OCR engine mode of this Options from an enum constant.
 *
 * The constant's ordinal is handed to {@link #oem(int)}.
 *
 * @param oem as enum constant
 * @return this Options
 * @see OEM
 */
public Options oem(OEM oem) {
  int mode = oem.ordinal();
  oem(mode);
  return this;
}
//
//
private int psm;
/**
 * Returns the page segmentation mode currently stored in this Options.
 *
 * The value is -1 after {@link #resetPSM()}.
 *
 * @return psm as int
 * @see PSM
 */
public int psm() {
  return this.psm;
}
/**
 * Sets the page segmentation mode of this Options.
 *
 * The modes that use OSD (OSD_ONLY, AUTO_OSD, SPARSE_TEXT_OSD) additionally require the file
 * {@code osd.traineddata} in the current {@link #dataPath()}.
 * NOTE(review): if no data path is known yet, {@link #dataPath()} may be null and the file is
 * then looked up relative to the working directory - confirm that this is intended.
 *
 * @param psm as int (0 .. 13)
 * @return this Options
 * @throws IllegalArgumentException if psm is out of range or OSD is needed
 *                                  but osd.traineddata is missing
 * @see PSM
 */
public Options psm(int psm) {
  // derive the upper bound from the enum, so range check and message cannot drift apart
  final int maxPsm = PSM.values().length - 1;
  if (psm < 0 || psm > maxPsm) {
    throw new IllegalArgumentException(String.format("OCR: Invalid PSM %s (0 .. %d)", psm, maxPsm));
  }
  boolean needsOsd = psm == PSM.OSD_ONLY.ordinal()
      || psm == PSM.AUTO_OSD.ordinal()
      || psm == PSM.SPARSE_TEXT_OSD.ordinal();
  if (needsOsd && !new File(dataPath(), "osd.traineddata").exists()) {
    throw new IllegalArgumentException(String.format("OCR: setPSM(%d): needs OSD, " +
        "but no osd.traineddata found in tessdata folder", psm));
  }
  this.psm = psm;
  return this;
}
/**
 * Sets the page segmentation mode of this Options from an enum constant.
 *
 * The constant's ordinal is handed to {@link #psm(int)}.
 *
 * @param psm as enum constant
 * @return this Options
 * @see PSM
 */
public Options psm(PSM psm) {
  int mode = psm.ordinal();
  psm(mode);
  return this;
}
/**
 * Sets this PSM to -1.
 *
 * This causes Tess4J not to set the PSM at all.
 *
 * Only use it, if you know what you are doing.
 *
 * @return this Options
 */
public Options resetPSM() {
  this.psm = -1;
  return this;
}
/**
 * Configures this Options to recognize a single line (PSM.SINGLE_LINE).
 *
 * @return this Options
 */
public Options asLine() {
  final PSM mode = PSM.SINGLE_LINE;
  return psm(mode);
}
/**
 * Configures this Options to recognize a single word (PSM.SINGLE_WORD).
 *
 * @return this Options
 */
public Options asWord() {
  final PSM mode = PSM.SINGLE_WORD;
  return psm(mode);
}
/**
 * Configures this Options to recognize a single character (PSM.SINGLE_CHAR).
 *
 * @return this Options
 */
public Options asChar() {
  final PSM mode = PSM.SINGLE_CHAR;
  return psm(mode);
}
//
//
private String language;
/**
 * Returns the current language.
 *
 * @return the language short string
 * @see #language(String)
 */
public String language() {
  return this.language;
}
/**
 * Sets the language short string.
 *
 * Must not be null or empty (see {@link Settings#OcrLanguage} for a usable fallback).
 * According to the Tesseract rules this is a 3-lowercase-letters string
 * like eng, deu, fra, rus, ....
 * For special cases it might be something like xxx_yyy (chi_sim)
 * or even xxx_yyyy (deu_frak) or even xxx_yyy_zzzz (chi_tra_vert), but always all lowercase.
 * Take care that the corresponding ....traineddata file is in the datapath/tessdata folder
 * at the latest when an OCR feature is used.
 * The format of the string itself is currently not checked.
 *
 * @param language the language string
 * @return this Options
 * @throws IllegalArgumentException if language is null or empty
 * @see "Tesseract language files"
 */
public Options language(String language) {
  boolean missing = language == null || language.isEmpty();
  if (missing) {
    throw new IllegalArgumentException(String.format("OCR: Invalid language %s", language));
  }
  //TODO check language string (RegEx?)
  this.language = language;
  return this;
}
//
//
protected static String defaultDataPath = null;
private String dataPath;
/**
 * Returns the current datapath of this Options.
 *
 * If no datapath was set on this Options, the shared default datapath is returned, which
 * might still be null if no OCR feature was used until now. In that case it will be
 * evaluated at time of OCR feature usage to the default SikuliX path or to
 * Settings.OcrDataPath (if set).
 *
 * @return the current Tesseract datapath in this Options (might be null)
 */
public String dataPath() {
  return dataPath != null ? dataPath : defaultDataPath;
}
/**
 * Sets the folder for Tesseract to find language and configs files.
 *
 * The files are expected in the tessdata subfolder: if the last name element of the given
 * path is not {@code tessdata}, {@code tessdata} is appended and the path is made absolute.
 * TAKE CARE, that all is in place at time of OCR feature usage.
 * If null, the datapath will be evaluated at time of OCR feature usage to the default
 * SikuliX path or to Settings.OcrDataPath (if set).
 *
 * @param dataPath the absolute filename string (or null)
 * @return this Options
 * @see #language(String)
 */
public Options dataPath(String dataPath) {
  String resolved = dataPath;
  if (resolved != null && !"tessdata".equals(new File(resolved).getName())) {
    resolved = new File(resolved, "tessdata").getAbsolutePath();
  }
  this.dataPath = resolved;
  return this;
}
//
//
/**
 * Convenience: configures the image optimization for small fonts.
 *
 * Sets the text height to 10. Might give better results in cases with small
 * fonts with a pixel height lt 12 (font sizes lt 10).
 *
 * @return this Options
 */
public Options smallFont() {
  final float smallFontHeight = 10;
  textHeight(smallFontHeight);
  return this;
}
/**
 * Measures the line height of the text "X" in the default font of a scratch image's
 * graphics context.
 *
 * @return the measured height
 */
private static float getDefaultTextHeight() {
  Graphics graphics = new BufferedImage(100, 100, BufferedImage.TYPE_INT_RGB).getGraphics();
  try {
    FontMetrics metrics = graphics.getFontMetrics(graphics.getFont());
    return metrics.getLineMetrics("X", graphics).getHeight();
  } finally {
    // always release the graphics context of the scratch image
    graphics.dispose();
  }
}
private float textHeight;
// numerator of the resize factor: factor() divides this by textHeight
private static final int OPTIMAL_X_HEIGHT = 30;
/**
 * Returns the current base for image optimization before OCR.
 *
 * @return value
 * @see #textHeight(float)
 */
public float textHeight() {
  return this.textHeight;
}
/**
 * Configures the image optimization.
 *
 * The value should be the (in case average) height in pixels of an uppercase X
 * in the image's text. The value is stored as given, without any validation.
 *
 * NOTE: should only be tried in cases, where the defaults do not lead to acceptable results
 *
 * @param height a number of pixels
 * @return this Options
 */
public Options textHeight(float height) {
  this.textHeight = height;
  return this;
}
/**
 * Configures the image optimization by font size.
 *
 * The value should be the (in case average) font size; it is used as base for internally
 * calculating the {@link #textHeight()}: the line height of "X" is measured in a plain
 * font of the given size, with the name of the default font of a scratch image.
 *
 * NOTE: should only be tried in cases, where the defaults do not lead to acceptable results
 *
 * @param size of a font
 * @return this Options
 */
public Options fontSize(int size) {
  Graphics graphics = new BufferedImage(100, 100, BufferedImage.TYPE_INT_RGB).getGraphics();
  try {
    String fontName = graphics.getFont().getFontName();
    FontMetrics metrics = graphics.getFontMetrics(new Font(fontName, Font.PLAIN, size));
    textHeight(metrics.getLineMetrics("X", graphics).getHeight());
  } finally {
    // always release the graphics context of the scratch image
    graphics.dispose();
  }
  return this;
}
private Image.Interpolation resizeInterpolation;
/**
 * INTERNAL: the interpolation method currently stored in this Options.
 *
 * @return the stored {@link Image.Interpolation}
 */
protected Image.Interpolation resizeInterpolation() {
  return this.resizeInterpolation;
}
/**
 * INTERNAL (under investigation).
 *
 * should not be used - not supported
 *
 * @param method {@link Image.Interpolation}
 * @return this Options
 */
public Options resizeInterpolation(Image.Interpolation method) {
  this.resizeInterpolation = method;
  return this;
}
private Float bestDPI = null;
/**
 * INTERNAL: the manually set best DPI value.
 *
 * The stored value is null unless {@link #bestDPI(int)} was called (it is also null after
 * {@link #reset()}); in that case unboxing it here throws a NullPointerException.
 *
 * @return the stored value as float
 */
protected float bestDPI() {
  return this.bestDPI.floatValue();
}
/**
 * INTERNAL (under investigation).
 *
 * should not be used - not supported
 *
 * Once set, the resize factor is calculated from this value and the screen
 * resolution instead of from the text height.
 *
 * @param dpi the dpi value
 * @return this Options
 */
public Options bestDPI(int dpi) {
  this.bestDPI = Float.valueOf(dpi);
  return this;
}
private static final int TESSERACT_USER_DEFINED_DPI = 300;
private int userDPI;
/**
 * INTERNAL (under investigation).
 *
 * should not be used - not supported
 *
 * A value of 0 is replaced by the current screen resolution. The resulting value is stored
 * and additionally set as the Tesseract variable {@code user_defined_dpi}.
 *
 * @param dpi 70 .. 2400 (or 0 for the screen resolution)
 * @return this Options
 * @throws IllegalArgumentException if the resulting value is not in the range 70 .. 2400
 */
//TODO why is this needed? Tess4J/Tesseract produce a warning is not set or not 70 .. 2400
public Options userDPI(int dpi) {
  int effectiveDpi = (dpi == 0) ? Toolkit.getDefaultToolkit().getScreenResolution() : dpi;
  boolean inRange = effectiveDpi >= 70 && effectiveDpi <= 2400;
  if (!inRange) {
    throw new IllegalArgumentException(String.format("OCR: Invalid user DPI: %s (must be 70 .. 2400)", effectiveDpi));
  }
  userDPI = effectiveDpi;
  variable("user_defined_dpi", Integer.toString(effectiveDpi));
  return this;
}
/**
 * INTERNAL: the resize factor for the image optimization.
 *
 * @return OPTIMAL_X_HEIGHT divided by the text height, or (legacy) the manually set
 *         bestDPI divided by the screen resolution
 */
protected float factor() {
  if (bestDPI == null) {
    return OPTIMAL_X_HEIGHT / textHeight;
  }
  // LEGACY: Calculate the resize factor based on the optimal and
  // calculated DPI value if bestDPI has been set manually
  return bestDPI / Toolkit.getDefaultToolkit().getScreenResolution();
}
//
//
// Tesseract variables (key to value) in insertion order
private Map<String, String> variables = new LinkedHashMap<>();
/**
 * Returns the currently stored variables.
 *
 * The returned map is the live internal map, not a copy.
 *
 * @return the currently stored variables
 * @see #variable(String, String)
 */
public Map<String, String> variables() {
  return variables;
}
/**
 * Sets a variable for Tesseract.
 *
 * A value already stored for the same key is replaced.
 * You should know, what you are doing - consult the Tesseract docs.
 *
 * @param key the key
 * @param value the value
 * @return this Options
 * @see "Tesseract docs"
 */
public Options variable(String key, String value) {
  this.variables.put(key, value);
  return this;
}
//
//
// names of Tesseract configs files, without duplicates, in insertion order
private Set<String> configs = new LinkedHashSet<>();
/**
 * Returns the current configs as a new list.
 *
 * Changes to the returned list do not affect this Options.
 *
 * @return currently stored names of configs files
 * @see #configs(String...)
 */
public List<String> configs() {
  return new ArrayList<>(configs);
}
/**
 * Sets one or more configs file names, replacing the currently stored ones.
 *
 * You should know, what you are doing - consult the Tesseract docs.
 *
 * @param configs one or more configs filenames
 * @return this Options
 * @see "Tesseract docs"
 */
public Options configs(String... configs) {
  List<String> names = Arrays.asList(configs);
  configs(names);
  return this;
}
/**
 * Sets a list of configs file names, replacing the currently stored ones.
 *
 * Duplicates are dropped; the order of first occurrence is kept.
 * You should know, what you are doing - consult the Tesseract docs.
 *
 * @param configs a list of configs filenames
 * @return this Options
 * @see "Tesseract docs"
 */
public Options configs(List<String> configs) {
  this.configs = new LinkedHashSet<>(configs);
  return this;
}
//
//
/**
 * @return true if at least one config or one variable is stored
 */
private boolean hasVariablesOrConfigs() {
  return !(configs.isEmpty() && variables.isEmpty());
}
/**
 * Formats the stored configs and variables for {@link #toString()}.
 *
 * <pre>
 * configs: conf1,conf2,...
 * variables: key:value,...
 * </pre>
 *
 * A line is left out if there is nothing to report for it.
 *
 * @return the formatted text (empty if neither configs nor variables are stored)
 */
private String logVariablesConfigs() {
  StringBuilder configsText = new StringBuilder();
  for (Object config : configs) {
    // the first entry gets the label, every further one a separator
    configsText.append(configsText.length() == 0 ? "configs: " : ",").append(config);
  }
  StringBuilder variablesText = new StringBuilder();
  for (Object key : variables.keySet()) {
    variablesText.append(variablesText.length() == 0 ? "variables: " : ",")
        .append(key).append(":").append(variables.get(key));
  }
  // trim() removes the line break if one of the two lines is missing
  return (configsText + "\n" + variablesText).trim();
}
//
}
//
//
/**
 * Resets the global options to the initial defaults.
 *
 * @see OCR.Options#reset()
 * @return the global Options
 */
public static Options reset() {
  Options global = globalOptions();
  return global.reset();
}
/**
 * Prints out the current global options (via Debug.logp).
 */
public static void status() {
  String state = globalOptions().toString();
  Debug.logp("Global settings " + state);
}
//
//
/**
 * Reads text from the given source.
 * Uses the global options.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @return text
 */
public static <SFIRBS> String readText(SFIRBS from) {
  return readText(from, globalOptions());
}
/**
 * Reads text from the given source.
 * Uses the given options.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @param options Options to be used
 * @return text
 */
public static <SFIRBS> String readText(SFIRBS from, Options options) {
  return TextRecognizer.get(options).readText(from);
}
//
/**
* chapter info
*/
//
/**
 * Reads text from the given source (line),
 * assuming the source contains a single line of text.
 *
 * Uses the global options.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @return text
 */
public static <SFIRBS> String readLine(SFIRBS from) {
  return readLine(from, globalOptions());
}
/**
 * Reads text from the given source (line),
 * assuming the source contains a single line of text.
 *
 * Uses the given options. They are not modified: a copy configured
 * for a single line is used.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @param options options for the used TextRecognizer
 * @return text
 */
public static <SFIRBS> String readLine(SFIRBS from, Options options) {
  return readText(from, options.clone().asLine());
}
/**
 * Reads text from the given source as lines.
 * Uses the global options.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @return lines as a list of matches
 */
public static <SFIRBS> List readLines(SFIRBS from) {
  return readLines(from, globalOptions());
}
/**
 * Reads text from the given source as lines.
 * Uses the given options.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @param options options for the used TextRecognizer
 * @return lines as a list of matches
 */
public static <SFIRBS> List readLines(SFIRBS from, Options options) {
  return TextRecognizer.get(options).readLines(from);
}
//
//
/**
 * Reads text from the given source (word),
 * assuming the source contains a single word of text.
 *
 * Uses the global options.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @return text
 */
public static <SFIRBS> String readWord(SFIRBS from) {
  return readWord(from, globalOptions());
}
/**
 * Reads text from the given source (word),
 * assuming the source contains a single word of text.
 *
 * Uses the given options. They are not modified: a copy configured
 * for a single word is used.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @param options options for the used TextRecognizer
 * @return text
 */
public static <SFIRBS> String readWord(SFIRBS from, Options options) {
  return readText(from, options.clone().asWord());
}
/**
 * Reads text from the given source as words.
 * Uses the global options.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @return words as a list of matches
 */
public static <SFIRBS> List readWords(SFIRBS from) {
  return readWords(from, OCR.globalOptions());
}
/**
 * Reads text from the given source as words.
 * Uses the given options.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @param options options for the used TextRecognizer
 * @return words as a list of matches
 */
public static <SFIRBS> List readWords(SFIRBS from, Options options) {
  return TextRecognizer.get(options).readWords(from);
}
//
//
/**
 * Reads text from the given source (character),
 * assuming the source contains a single character.
 *
 * Uses the global options.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @return text
 */
public static <SFIRBS> String readChar(SFIRBS from) {
  return readChar(from, globalOptions());
}
/**
 * Reads text from the given source (character),
 * assuming the source contains a single character.
 *
 * Uses the given options. They are not modified: a copy configured
 * for a single character is used.
 *
 * @param <SFIRBS> File name, File, Image, Region, BufferedImage or ScreenImage
 * @param from source to read text from
 * @param options options for the used TextRecognizer
 * @return text
 */
public static <SFIRBS> String readChar(SFIRBS from, Options options) {
  return readText(from, options.clone().asChar());
}
//
}