All downloads are free. Search and download functionality uses the official Maven repository.

net.sf.jmatchparser.util.charset.ToAsciiMapping Maven / Gradle / Ivy

Go to download

A Java-based parser for parsing/grabbing web sites and other text or XML documents, based on a nondeterministic parser language, creating XML output. It also contains a few utility classes for HTML, CSV and text parsing, and additional character sets. The jMatchParser-charset module contains the character sets.

The newest version!
/* 
 * ToAsciiMapping.java - convert special characters to ASCII
 * 
 * Copyright (c) 2009 - 2011 Michael Schierl
 * 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *   
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *   
 * - Neither name of the copyright holders nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *   
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package net.sf.jmatchparser.util.charset;

/**
 * Utility class to map a Unicode string to an ASCII string, if possible.
 *
 * <p>
 * Every UTF-16 code unit of the input is replaced by exactly one ASCII
 * character, so the result always has the same length as the input:
 * <ul>
 * <li>code units below 128 are kept unchanged;</li>
 * <li>code units covered by {@link #MAPPINGS} are replaced by the listed
 * look-alike (for example U+00C0 becomes {@code A}, U+00DF becomes {@code s}
 * and U+201C becomes {@code "});</li>
 * <li>every other code unit becomes {@code ?}.</li>
 * </ul>
 *
 * <p>
 * The complete list of replacements is the {@link #MAPPINGS} table itself;
 * see there for the encoding.
 */
public class ToAsciiMapping {

    /**
     * Run-length encoded replacement table (generated by
     * ToAsciiMappingGenerator.java).
     *
     * <p>
     * Each non-empty entry describes one run of consecutive code units: the
     * first character of the string is the first code unit of the run, and
     * the character at index {@code j} (for {@code j >= 1}) is the ASCII
     * replacement for code unit {@code first + j - 1}. The entry at index 0
     * is an empty placeholder and is never read by the static initializer.
     *
     * <p>
     * NOTE(review): the runs starting at U+2000 and U+3000 each carry a
     * single space. If this listing ever passed through whitespace-normalizing
     * tooling, longer runs of spaces may have been shortened -- confirm
     * against the generator output.
     */
    // generated by ToAsciiMappingGenerator.java
    static String[] MAPPINGS = {
        "",
        "\u00a1!cL@Y|S^",
        "\u00aaa<-",
        "\u00af-0+23/mP._1o>113?AAAAAAACEEEEIIIIDNOOOOO",
        "\u00d8OUUUUYPsaaaaaaaceeeeiiiidnooooo",
        "\u00f8ouuuuypyAaAaAaCcCcCcCcDdDdEeEeEeEeEeGgGgGgGgHhHhIiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnnNnOoOoOoOoRrRrRrSsSsSsSsTtTtTtUuUuUuUuUuUuWwYyYZzZzZzsbBBbBbOCcDDDddEEEFfGG",
        "\u0196IIKkllMNnOOo",
        "\u01a4Pp",
        "\u01a722SstTtTUuuVYyZzZZZz255\'w|!!!DDd",
        "\u01c8L",
        "\u01cbN",
        "\u01cdAaIiOoUuUuUuUuUueAaAaAaGgGgKkOoOoZZj",
        "\u01f2D",
        "\u01f4Gg",
        "\u01f7wNnAaAaOoAaAaEeEeIiIiOoOoRrRrUuUuSsTtZzHhNd",
        "\u0224ZzAaEeOoOoOoOoYylnt",
        "\u023aACcLTsz",
        "\u0243B",
        "\u0246EeJj",
        "\u024bqRrYyaaabocddeeeeeeejggGgyhhhiiIlll",
        "\u026fmmmnnNo",
        "\u0277o",
        "\u0279rrrrrrrRrssjssytuuvvwyYzzzz\'\'\'COBeGHjkLq\'\'",
        "\u02acwthhhhjrrrRwy\'\"`\'",
        "\u02c4^",
        "\u02c6^^\'-/`",
        "\u02cd_",
        "\u02d8^\'0_~^",
        "\u02e0glsx\'",
        "\u0300`/^~--^\'^",
        "\u030a0^^\'\"",
        "\u0327__",
        "\u0331__",
        "\u0363aeioucdhmrtvx",
        "\u037e;",
        "\u0384\'^",
        "\u038fO",
        "\u0393G",
        "\u0398T",
        "\u03a3S",
        "\u03a6F",
        "\u03a9O",
        "\u03acae",
        "\u03b1as",
        "\u03b4de",
        "\u03bcm",
        "\u03c0p",
        "\u03c3st",
        "\u03c6f",
        "\u03d0s",
        "\u03d5fp",
        "\u03f4Te",
        "\u03f9S",
        "\u040ey",
        "\u0433-",
        "\u0491-",
        "\u0493-",
        "\u0495-",
        "\u04bbh",
        "\u04f7-",
        "\u04fb-",
        "\u0527h",
        "\u0589:",
        "\u066a%",
        "\u0e13-",
        "\u0e1a|--",
        "\u0e1f-L",
        "\u0e24|",
        "\u0e28L-",
        "\u0e2d=",
        "\u0e39--",
        "\u1d00A",
        "\u1d03BCD",
        "\u1d07EeiJKLMNOOooo",
        "\u1d16ooPRRTUuumVWZ",
        "\u1d24\'L",
        "\u1d2cAABBDEEGHIJKLMNNO",
        "\u1d3ePRTUWaaa",
        "\u1d47bdeeeegikmnooooptuumvLs",
        "\u1d5fdf",
        "\u1d62iruvs",
        "\u1d69f",
        "\u1d6cbdfmnprrstz",
        "\u1d7cip",
        "\u1d7fubdfgklmnprssvxzaadeeeeiosuzaccdefjghiiI",
        "\u1da8jllLmmnnNo",
        "\u1db3sstuuUvvzzzz",
        "\u1dd4a",
        "\u1dd9dgGklLMnNR",
        "\u1de4ssz",
        "\u1e00AaBbBbBbCcDdDdDdDdDdEeEeEeEeEeFfGgHhHhHhHhHhIiIiKkKkKkLlLlLlLlMmMmMmNnNnNnNnOoOoOoOoPpPpRrRrRrRrSsSsSsSsSsTtTtTtTtUuUuUuUuUuVvVvWwWwWwWwWwXxXxYyZzZzZzhtwyasss",
        "\u1ea0AaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy",
        "\u1efeYyaaaaaaaa",
        "\u1f10eeeeee",
        "\u1f68OOOOOOOOaaee",
        "\u1f80aaaaaaaa",
        "\u1fa8OOOOOOOOaaaaa",
        "\u1fb6aa",
        "\u1fef`",
        "\u1ffaOOO",
        "\u2000 ",
        "\u2010--",
        "\u2014-",
        "\u2017=`\'",
        "\u201c\"\"\"",
        "\u2022.",
        "\u2024.",
        "\u2032\'^",
        "\u2035`",
        "\u2039<>",
        "\u203c!",
        "\u2044/[]",
        "\u20700i",
        "\u2074456789+",
        "\u207c=()n0123456789+",
        "\u208c=()",
        "\u2090aeoxehklmnpst",
        "\u20a1c",
        "\u20a4L",
        "\u20a7P",
        "\u20aa@",
        "\u2102CC",
        "\u2107E",
        "\u2109FgHHHhhIILl",
        "\u2115N",
        "\u2118PPQRRR",
        "\u2124Z",
        "\u2126O",
        "\u2128Z",
        "\u212aKABCeeEFFMo",
        "\u2139i0",
        "\u213cp",
        "\u213eG",
        "\u2141GLLYDdeij",
        "\u215f1I",
        "\u2164V",
        "\u2169X",
        "\u216cLCDMi",
        "\u2174v",
        "\u2179x",
        "\u217clcdm",
        "\u2190<^>v-|",
        "\u219a<>",
        "\u21a2<>",
        "\u21a8|<><>",
        "\u21ae-",
        "\u21b0^^vv>v",
        "\u21de^v",
        "\u21f7<>-<>-",
        "\u2205O",
        "\u2212-+",
        "\u2215/\\*0.v",
        "\u221e8L",
        "\u2223|",
        "\u2229n",
        "\u2236:",
        "\u223c~",
        "\u2248~",
        "\u2260==",
        "\u2264==",
        "\u226a<>",
        "\u226e<>",
        "\u2298/",
        "\u229a0*",
        "\u22beL",
        "\u22c5.",
        "\u2302|^",
        "\u2310-",
        "\u2320()",
        "\u2329<>",
        "\u237cL",
        "\u237f|",
        "\u2460123456789",
        "\u2474123456789",
        "\u2488123456789",
        "\u249cabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0",
        "\u2500-",
        "\u2502|",
        "\u250c-",
        "\u2510+",
        "\u2514L",
        "\u2518-",
        "\u251c+",
        "\u2524+",
        "\u252cT",
        "\u2534+",
        "\u253c+",
        "\u2550=|++-+++++L++-||||||--T---+++",
        "\u2580-",
        "\u2584-",
        "\u2588-",
        "\u258c|",
        "\u2590|---",
        "\u25a0|",
        "\u25ac-",
        "\u25b2^",
        "\u25ba>",
        "\u25bc^",
        "\u25c4<",
        "\u25cb0",
        "\u25d8.0",
        "\u25f40000",
        "\u263aOO0",
        "\u2640+",
        "\u2642>",
        "\u2660|",
        "\u2663|",
        "\u2665||",
        "\u266add",
        "\u268600",
        "\u2758|",
        "\u275b`\'\"\"",
        "\u2908v^",
        "\u2911>",
        "\u2914>>",
        "\u2945><",
        "\u298b[][][]",
        "\u29b1OOOO",
        "\u29b8\\",
        "\u29bf.",
        "\u29ec0",
        "\u29f6/\\",
        "\u2a22+++++++----",
        "\u2a40n",
        "\u2a43nn",
        "\u2a66=",
        "\u2a6a~~",
        "\u2a6f~",
        "\u2a77=",
        "\u2aae=",
        "\u2aef||",
        "\u2b0e>><<",
        "\u2b38<<<",
        "\u2c60LlLPRatHhKkZz",
        "\u2c6eM",
        "\u2c71vWwv",
        "\u2c78ero",
        "\u2c7cjVSZ",
        "\u2e1a-~",
        "\u2e1e~~",
        "\u3000 ",
        "\u3008<><>",
        "\u301a[=",
        "\u301d\"\"\"",
        "\u30fb.-",
        "\u32c0123456789",
        "\u33580123456789",
        "\u3382A",
        "\u338cFg",
        "\u3396mdk",
        "\u339bm",
        "\u33a1m",
        "\u33a5m",
        "\u33b2s",
        "\u33b6V",
        "\u33bcW",
        "\u33c0kM",
        "\u33e0123456789",
        "\ua740KkKkKk",
        "\ua748LlOoOo",
        "\ua750PpPpPpQqQq",
        "\ua75eVv",
        "\ua764PpPp",
        "\ua78el",
        "\ua790Nn",
        "\ua7a0GgKkNnRrSs",
        "\uf8c2-",
        "\ufb05t",
        "\ufb29+",
        "\ufe10,",
        "\ufe13:;!?",
        "\ufe33__(){}",
        "\ufe47[]",
        "\ufe4d___,",
        "\ufe52.",
        "\ufe54;:?!",
        "\ufe59(){}",
        "\ufe5f#&*+-<>=",
        "\ufe68\\$%@",
        "\uff01!\"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~",
        "\uffe0cL",
        "\uffe5Y",
        "\uffecv",
        "\uffee0"
    };

    /**
     * Direct lookup table indexed by UTF-16 code unit (0..65535); each slot
     * holds the ASCII replacement for that code unit. Filled once by the
     * static initializer and only read afterwards.
     */
    private static final byte[] map = new byte[65536];

    static {
        // Defaults: 7-bit ASCII maps to itself, everything else to '?'.
        for (int i = 0; i < map.length; i++) {
            map[i] = i < 128 ? (byte) i : (byte) '?';
        }
        // Overlay the generated runs. Index 0 is the empty placeholder entry,
        // so decoding starts at index 1.
        for (int i = 1; i < MAPPINGS.length; i++) {
            String run = MAPPINGS[i];
            int start = run.charAt(0);
            for (int j = 1; j < run.length(); j++) {
                map[start + j - 1] = (byte) run.charAt(j);
            }
        }
    }

    /**
     * Map a given string to ASCII.
     *
     * <p>
     * The mapping works on individual UTF-16 code units: each {@code char}
     * of {@code text} is looked up independently and replaced by one ASCII
     * character, so the result has the same length as {@code text}.
     *
     * @param text
     *            the string to convert; must not be {@code null}
     * @return a string of the same length consisting only of ASCII
     *         characters
     * @throws NullPointerException
     *             if {@code text} is {@code null}
     */
    public static String mapToAscii(String text) {
        final int length = text.length();
        char[] resultBuffer = new char[length];
        for (int i = 0; i < length; i++) {
            resultBuffer[i] = (char) map[text.charAt(i)];
        }
        return new String(resultBuffer);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy