
// jadeutils.text.HtmlUtil Maven / Gradle / Ivy
// The newest version!
package jadeutils.text;
import java.util.HashMap;
import java.util.Map;
/**
 * Converts HTML to plain text while preserving its formatting.
 *
 * @author zhangZhiPeng
 */
public class HtmlUtil {
/**
 * Demo entry point: runs {@link #html2text(String)} on a hard-coded sample and
 * prints the result to standard output.
 *
 * @param args
 *            command-line arguments (not used)
 */
public static void main(String[] args) {
// NOTE(review): the sample literal below contains raw line breaks and no visible
// markup, so it does not form valid Java string literals as shown; presumably the
// HTML tags and escapes were lost when this listing was extracted -- restore it
// from the original source before compiling.
String content = html2text("The Nobel(撒娇的空间卡死的快乐? Peace Prize for 2008 was given to Martti Ahtisaari. "
+ "He was the president of Finland from 1994 to 2000. He won the prize for his work in solving international "
+ "conflicts for more than 30 years.
"
+ " his work has made a more peaceful world in Nobel spirit, the officer said, "
+ "he has won the prize.
" //
+ "理论基础
在这里1V可以定义为产生1A电流来做1W的功所需要的压力。如果脱离电子学来看:
"
+ " - 1W等于每秒1J的功。
- 1J等于1N的力的作用下,前进1m所需要的功。
- 1N等于每秒让1kg的物体加速1 m/s所需要的力。
"
+ " 毫瓦表示法 瓦特表示法 千瓦表示法 兆瓦表示法 "
+ " 1 mW 0.001 W 0.000,001 kW 0.000,000,001 MW "
+ " 1000 mW 1 W 0.001 kW 0.000,001 MW "
+ " 1,000,000 mW 1000 W 1 kW 0.001 MW "
+ " 1,000,000,000 mW 1000,000 W 1000 kW 1 MW
"
+ " 注意:毫瓦(mW)里的m是小写的,而兆瓦(MW)里的M是大写的。不要写错了。
");
System.out.println(content);
}
/**
 * Converts an HTML source string to formatted text.
 *
 * <p>The input is consumed token by token: each call to {@code parse} yields the
 * token that starts at the current offset, the token's text is appended to the
 * result, and the offset advances by the token's length. Scanning stops when
 * {@code parse} returns {@code null}.
 *
 * @param html
 *            html source; must not be {@code null}
 * @return formatted text
 */
public static String html2text(String html) {
    // The buffer never leaves this method, so the unsynchronized StringBuilder
    // is sufficient; StringBuffer's locking would be pure overhead here.
    StringBuilder text = new StringBuilder(html.length());
    char[] data = html.toCharArray();
    int offset = 0;
    boolean previousIsPre = false;
    HtmlToken token;
    while ((token = parse(data, offset, previousIsPre)) != null) {
        // Remember whether this token was a pre tag; the flag is handed to the
        // next parse call.
        previousIsPre = token.isPreTag();
        text.append(token.getText());
        offset += token.getLength();
    }
    return text.toString();
}
/**
* parse html
*
* @param data
* data
* @param start
* start index
* @param previousIsPre
* whether the previously parsed token reported itself as a pre tag
* @return html TOKEN
*/
private static HtmlToken parse(char[] data, int start, boolean previousIsPre) {
if (start >= data.length)
return null;
// try to read next char:
char c = data[start];
if (c == '<') {
// this is a tag or comment or script:
int end_index = indexOf(data, start + 1, '>');
if (end_index == (-1)) {
// the left is all text!
return new HtmlToken(HtmlToken.TOKEN_TEXT, data, start, data.length, previousIsPre);
}
String s = new String(data, start, end_index - start + 1);
// now we got s="<...>":
if (s.startsWith("");
if (end_comment_index == (-1)) {
// illegal end, but treat as comment:
return new HtmlToken(HtmlToken.TOKEN_COMMENT, data, start, data.length, previousIsPre);
} else {
return new HtmlToken(HtmlToken.TOKEN_COMMENT, data, start, end_comment_index + 3, previousIsPre);
}
}
String s_lowerCase = s.toLowerCase();
if (s_lowerCase.startsWith("