手机站
网通分站
电信主站
密 码:
用户名:
当前位置 : 主页>程序设计>Java技术>列表

将HTML转化为TEXT的Java类

来源:互联网 作者:west263.com 时间:2008-02-23
西部数码-全国虚拟主机10强!40余项虚拟主机管理功能,全国领先!双线多线虚拟主机南北访问畅通无阻!免费赠送企业邮局,.CN域名,自助建站480元起,免费试用7天,满意再付款! P4主机租用799元/月.月付免压金!

private static final char[] END_TAG_LI = "</li>".toCharArray();

private static final Map SPECIAL_CHARS = new HashMap();

private int type;
private String html; // original html
private String text = null; // text!
private int length = 0; // html length
private boolean isPre = false; // isPre tag?

static {
SPECIAL_CHARS.put("&quot;", "\"");
SPECIAL_CHARS.put("&lt;", "<");
SPECIAL_CHARS.put("&gt;", ">");
SPECIAL_CHARS.put("&amp;", "&");
SPECIAL_CHARS.put("&reg;", "(r)");
SPECIAL_CHARS.put("&copy;", "(c)");
SPECIAL_CHARS.put("&nbsp;", " ");
SPECIAL_CHARS.put("&pound;", "?");
}

public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
this.type = type;
this.length = end - start;
this.html = new String(data, start, length);
System.out.println("[Token] html=" html ".");
parseText(previousIsPre);
System.out.println("[Token] text=" text ".");
}

public int getLength() {
return length;
}

public boolean isPreTag() {
return isPre;
}

private void parseText(boolean previousIsPre) {
if(type==TOKEN_TAG) {
char[] cs = html.toCharArray();
if(compareTag(TAG_BR, cs) || compareTag(TAG_P, cs))
text = "\n";
else if(compareTag(TAG_LI, cs))
text = "\n* ";
else if(compareTag(TAG_PRE, cs))
isPre = true;
else if(compareTag(TAG_HR, cs))
text = "\n--------\n";
else if(compareString(END_TAG_TD, cs))
text = "\t";
else if(compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs))
text = "\n";
}
// text token:
else if(type==TOKEN_TEXT) {
text = toText(html, previousIsPre);
}
}

public String getText() {
return text==null ? "" : text;
}

private String toText(String html, final boolean isPre) {
char[] cs = html.toCharArray();
StringBuffer buffer = new StringBuffer(cs.length);
int start = 0;
boolean continueSpace = false;
char current, next;
for(;;) {
if(start>=cs.length)
break;
current = cs[start]; // read current char
if(start 1<cs.length) // and next char
next = cs[start 1];
else
next = '\0';
if(current==' ') {
if(isPre || !continueSpace)
buffer = buffer.append(' ');
continueSpace = true;
// continue loop:
start ;
continue;
}
// not ' ', so:
if(current=='\r' && next=='\n') {
if(isPre)
buffer = buffer.append('\n');
// continue loop:
start =2;
continue;
}
if(current=='\n' || current=='\r') {
if(isPre)
buffer = buffer.append('\n');
// continue loop:
start ;
continue;
}
// cannot continue space:
continueSpace = false;
if(current=='&') {
// maybe special char:
int length = readUtil(cs, start, ';', 10);
if(length==(-1)) { // just '&':
buffer = buffer.append('&');
// continue loop:
start ;
continue;
}
else { // check if special character:
String spec = new String(cs, start, length);
String specChar = (String)SPECIAL_CHARS.get(spec);
if(specChar!=null) { // special chars!
buffer = buffer.append(specChar);
// continue loop:
start =length;
continue;
}
else { // check if like '&#1234':
if(next=='#') { // maybe a char
String num = new String(cs, start 2, length-3);
try {
int code = Integer.parseInt(num);
if(code>0 && code<65536) { // this is a special char:
buffer = buffer.append((char)code);
// continue loop:

文章整理:西部数码--专业提供域名注册虚拟主机服务
http://www.west263.com
以上信息与文章正文是不可分割的一部分,如果您要转载本文章,请保留以上信息,谢谢!