java获取字符串编码函数 / iVAN

encoding.java

package org.loon.test.encoding;

/**
 * Title: loonframework
 * <p>
 * Description: the set of character encodings known to the detector, together
 * with three name tables (Java name, readable name, HTML charset name) that are
 * indexed by the encoding constants below.
 * <p>
 * Copyright: copyright (c) 2008
 * <p>
 * Company: loonframework
 * <p>
 * License: http://www.apache.org/licenses/license-2.0
 *
 * @author chenpeng
 * @email：ceponline@yahoo.com.cn
 * @version 0.1
 */
public class encoding {

    // Supported encodings. Each value doubles as the index into the name tables.
    public static final int gb2312 = 0;

    public static final int gbk = 1;

    public static final int big5 = 2;

    public static final int utf8 = 3;

    public static final int unicode = 4;

    public static final int euc_kr = 5;

    public static final int sjis = 6;

    public static final int euc_jp = 7;

    public static final int ascii = 8;

    public static final int unknown = 9;

    /** Number of entries in each name table (one per constant above). */
    public static final int totalt = 10;

    public final static int simp = 0;

    public final static int trad = 1;

    /** Names used when resolving an encoding. */
    public static final String[] javaname = new String[totalt];

    /** Human-readable names. */
    public static final String[] nicename = new String[totalt];

    /** Charset names as used in HTML. */
    public static final String[] htmlname = new String[totalt];

    // The tables are static, so they are filled exactly once here instead of in
    // the constructor; they are therefore usable before any instance exists.
    static {
        javaname[gb2312] = "gb2312";
        javaname[gbk] = "gbk";
        javaname[big5] = "big5";
        javaname[utf8] = "utf8";
        javaname[unicode] = "unicode";
        javaname[euc_kr] = "euc_kr";
        javaname[sjis] = "sjis";
        javaname[euc_jp] = "euc_jp";
        javaname[ascii] = "ascii";
        javaname[unknown] = "iso8859_1";

        // HTML charset names
        htmlname[gb2312] = "gb2312";
        htmlname[gbk] = "gbk";
        htmlname[big5] = "big5";
        htmlname[utf8] = "utf-8";
        htmlname[unicode] = "utf-16";
        htmlname[euc_kr] = "euc-kr";
        htmlname[sjis] = "shift_jis";
        htmlname[euc_jp] = "euc-jp";
        htmlname[ascii] = "ascii";
        htmlname[unknown] = "iso8859-1";

        // readable names
        nicename[gb2312] = "gb-2312";
        nicename[gbk] = "gbk";
        nicename[big5] = "big5";
        nicename[utf8] = "utf-8";
        nicename[unicode] = "unicode";
        nicename[euc_kr] = "euc-kr";
        nicename[sjis] = "shift-jis";
        nicename[euc_jp] = "euc-jp";
        nicename[ascii] = "ascii";
        nicename[unknown] = "unknown";
    }

    /**
     * Creates an instance. The name tables are already populated by the static
     * initialiser, so there is nothing left to set up here.
     */
    public encoding() {
    }

    /**
     * Returns the three names of an encoding joined by commas, in the order
     * Java name, readable name, HTML name.
     *
     * @param type one of the encoding constants of this class (0 to totalt - 1)
     * @return the interned string {@code javaname,nicename,htmlname}
     * @throws ArrayIndexOutOfBoundsException if {@code type} is not a valid index
     */
    public String toencoding(final int type) {
        return (javaname[type] + "," + nicename[type] + "," + htmlname[type])
                .intern();
    }

}

encode.java（省略，见源码）

parseencoding.java

package org.loon.test.encoding;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

/** *//**
* 
* title: loonframework
* 
* 
* description:
* 
* 
* copyright: copyright (c) 2008
* 
* 
* company: loonframework
* 
* 
* license: http://www.apache.org/licenses/license-2.0
* 
*
* @author chenpeng
* @email：ceponline@yahoo.com.cn
* @version 0.1
*/
public class parseencoding extends encode ...{

 public parseencoding() ...{
 super();
 gb2312format = new int[94][94];
 gbkformat = new int[126][191];
 big5format = new int[94][158];
 euc_krformat = new int[94][94];
 jpformat = new int[94][94];

 // 初始化编码格式
 init();
 }

 public string getencoding(final string path) ...{
 return check(getencodevalue(path));
 }

 public string getencoding(final inputstream in) ...{
 return check(getencodevalue(in));
 }

 public string getencoding(final byte[] buffer) ...{
 return check(getencodevalue(buffer));
 }

 public string getencoding(final url url) ...{
 return check(getencodevalue(url));
 }

 private string check(final int result) ...{
 if (result == -1) ...{
 return nicename[unknown];
 }
 return nicename[result];
 }

 /** *//**
 * 解析指定字符串路径编码所用格式
 *
 * @param path
 * @return
 */
 private int getencodevalue(string path) ...{
 int express = unknown;
 if (path.startswith("http://")) ...{
 try ...{
 express = getencodevalue(new url(path));
 } catch (malformedurlexception e) ...{
 express = -1;
 }
 } else ...{
 express = getencodevalue(new file(path));
 }
 return express;
 }

 /** *//**
 *
 * 解析指定inputstream所用编码，返回或然率最高的编码类型数值
 *
 * @param in
 * @return
 */
 public int getencodevalue(inputstream in) ...{
 byte[] rawtext = new byte[8192];
 int bytesread = 0, byteoffset = 0;
 int express = unknown;
 inputstream stream = in;
 try ...{
 while ((bytesread = stream.read(rawtext, byteoffset, rawtext.length
 - byteoffset)) > 0) ...{
 byteoffset += bytesread;
 }
 ;
 stream.close();
 express = getencodevalue(rawtext);
 } catch (exception e) ...{
 express = -1;
 }
 return express;
 }

 /** *//**
 * 解析指定url下数据所用编码，返回或然率最高的编码类型数值
 *
 * @param url
 * @return
 */
 public int getencodevalue(url url) ...{

 inputstream stream;
 try ...{
 stream = url.openstream();
 } catch (ioexception e) ...{
 stream = null;
 }

 return getencodevalue(stream);
 }

 /** *//**
 * 解析指定file所用编码，返回或然率最高的编码类型数值
 *
 * @param file
 * @return
 */
 public int getencodevalue(file file) ...{
 byte[] buffer;
 try ...{
 buffer = read(new fileinputstream(file));
 } catch (filenotfoundexception e) ...{
 buffer = null;
 }
 return getencodevalue(buffer);
 }

 /** *//**
 * 将inputstream转为byte[]
 *
 * @param inputstream
 * @return
 */
 private final byte[] read(final inputstream inputstream) ...{
 byte[] arraybyte = null;
 bytearrayoutputstream bytearrayoutputstream = new bytearrayoutputstream();
 byte[] bytes = new byte[8192];
 try ...{
 bytes = new byte[inputstream.available()];
 int read;
 while ((read = inputstream.read(bytes)) >= 0) ...{
 bytearrayoutputstream.write(bytes, 0, read);
 }
 arraybyte = bytearrayoutputstream.tobytearray();
 } catch (ioexception e) ...{
 return null;
 }
 return arraybyte;
 }

 /** *//**
 * 解析指定byte[]所用编码，返回或然率最高的数值类型
 *
 * @param content
 * @return
 */
 public int getencodevalue(byte[] content) ...{
 if (content == null)
 return -1;
 int[] scores;
 int index, maxscore = 0;
 int encoding = unknown;
 scores = new int[totalt];
 // 分配或然率
 scores[gb2312] = gb2312probability(content);
 scores[gbk] = gbkprobability(content);
 scores[big5] = big5probability(content);
 scores[utf8] = utf8probability(content);
 scores[unicode] = utf16probability(content);
 scores[euc_kr] = euc_krprobability(content);
 scores[ascii] = asciiprobability(content);
 scores[sjis] = sjisprobability(content);
 scores[euc_jp] = euc_jpprobability(content);
 scores[unknown] = 0;

 // 概率比较
 for (index = 0; index < totalt; index++) ...{
 if (scores[index] > maxscore) ...{
 // 索引
 encoding = index;
 // 最大几率
 maxscore = scores[index];
 }
 }
 // 返回或然率大于50%的数据
 if (maxscore <= 50) ...{
 encoding = unknown;
 }
 return encoding;
 }

 /** *//**
 * gb2312数据或然率计算
 *
 * @param content
 * @return
 */
 private int gb2312probability(byte[] content) ...{
 int i, rawtextlen = 0;

 int dbchars = 1, gbchars = 1;
 long gbformat = 0, totalformat = 1;
 float rangeval = 0, formatval = 0;
 int row, column;

 // 检查是否在亚洲汉字范围内
 rawtextlen = content.length;
 for (i = 0; i < rawtextlen - 1; i++) ...{
 if (content[i] >= 0) ...{
 } else ...{
 dbchars++;
 // 汉字gb码由两个字节组成，每个字节的范围是0xa1 ~ 0xfe
 if ((byte) 0xa1 <= content[i] && content[i] <= (byte) 0xf7
 && (byte) 0xa1 <= content[i + 1]
 && content[i + 1] <= (byte) 0xfe) ...{
 gbchars++;
 totalformat += 500;
 row = content[i] + 256 - 0xa1;
 column = content[i + 1] + 256 - 0xa1;
 if (gb2312format[row][column] != 0) ...{
 gbformat += gb2312format[row][column];
 } else if (15 <= row && row < 55) ...{
 // 在gb编码范围
 gbformat += 200;
 }

 }
 i++;
 }
 }
 rangeval = 50 * ((float) gbchars / (float) dbchars);
 formatval = 50 * ((float) gbformat / (float) totalformat);

 return (int) (rangeval + formatval);
 }

 /** *//**
 * gb2312或然率计算
 *
 * @param content
 * @return
 */
 private int gbkprobability(byte[] content) ...{
 int i, rawtextlen = 0;

 int dbchars = 1, gbchars = 1;
 long gbformat = 0, totalformat = 1;
 float rangeval = 0, formatval = 0;
 int row, column;
 rawtextlen = content.length;
 for (i = 0; i < rawtextlen - 1; i++) ...{
 if (content[i] >= 0) ...{
 } else ...{
 dbchars++;
 if ((byte) 0xa1 <= content[i] && content[i] <= (byte) 0xf7
 && // gb范围
 (byte) 0xa1 <= content[i + 1]
 && content[i + 1] <= (byte) 0xfe) ...{
 gbchars++;
 totalformat += 500;
 row = content[i] + 256 - 0xa1;
 column = content[i + 1] + 256 - 0xa1;
 if (gb2312format[row][column] != 0) ...{
 gbformat += gb2312format[row][column];
 } else if (15 <= row && row < 55) ...{
 gbformat += 200;
 }

 } else if ((byte) 0x81 <= content[i]
 && content[i] <= (byte) 0xfe && // gb扩展区域
 (((byte) 0x80 <= content[i + 1] && content[i + 1] <= (byte) 0xfe) || ((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7e))) ...{
 gbchars++;
 totalformat += 500;
 row = content[i] + 256 - 0x81;
 if (0x40 <= content[i + 1] && content[i + 1] <= 0x7e) ...{
 column = content[i + 1] - 0x40;
 } else ...{
 column = content[i + 1] + 256 - 0x40;
 }
 if (gbkformat[row][column] != 0) ...{
 gbformat += gbkformat[row][column];
 }
 }
 i++;
 }
 }
 rangeval = 50 * ((float) gbchars / (float) dbchars);
 formatval = 50 * ((float) gbformat / (float) totalformat);
 return (int) (rangeval + formatval) - 1;
 }

 /** *//**
 * 解析为big5的或然率
 *
 * @param content
 * @return
 */
 private int big5probability(byte[] content) ...{
 int i, rawtextlen = 0;
 int dbchars = 1, bfchars = 1;
 float rangeval = 0, formatval = 0;
 long bfformat = 0, totalformat = 1;
 int row, column;
 rawtextlen = content.length;
 for (i = 0; i < rawtextlen - 1; i++) ...{
 if (content[i] >= 0) ...{
 } else ...{
 dbchars++;
 if ((byte) 0xa1 <= content[i]
 && content[i] <= (byte) 0xf9
 && (((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7e) || ((byte) 0xa1 <= content[i + 1] && content[i + 1] <= (byte) 0xfe))) ...{
 bfchars++;
 totalformat += 500;
 row = content[i] + 256 - 0xa1;
 if (0x40 <= content[i + 1] && content[i + 1] <= 0x7e) ...{
 column = content[i + 1] - 0x40;
 } else ...{
 column = content[i + 1] + 256 - 0x61;
 }
 if (big5format[row][column] != 0) ...{
 bfformat += big5format[row][column];
 } else if (3 <= row && row <= 37) ...{
 bfformat += 200;
 }
 }
 i++;
 }
 }
 rangeval = 50 * ((float) bfchars / (float) dbchars);
 formatval = 50 * ((float) bfformat / (float) totalformat);

 return (int) (rangeval + formatval);
 }

 /** *//**
 * 在utf-8中的或然率
 *
 * @param content
 * @return
 */
 private int utf8probability(byte[] content) ...{
 int score = 0;
 int i, rawtextlen = 0;
 int goodbytes = 0, asciibytes = 0;
 // 检查是否为汉字可接受范围
 rawtextlen = content.length;
 for (i = 0; i < rawtextlen; i++) ...{
 if ((content[i] & (byte) 0x7f) == content[i]) ...{
 asciibytes++;
 } else if (-64 <= content[i] && content[i] <= -33
 && i + 1 < rawtextlen && -128 <= content[i + 1]
 && content[i + 1] <= -65) ...{
 goodbytes += 2;
 i++;
 } else if (-32 <= content[i] && content[i] <= -17
 && i + 2 < rawtextlen && -128 <= content[i + 1]
 && content[i + 1] <= -65 && -128 <= content[i + 2]
 && content[i + 2] <= -65) ...{
 goodbytes += 3;
 i += 2;
 }
 }

 if (asciibytes == rawtextlen) ...{
 return 0;
 }

 score = (int) (100 * ((float) goodbytes / (float) (rawtextlen - asciibytes)));
 // 如果不高于98则减少到零
 if (score > 98) ...{
 return score;
 } else if (score > 95 && goodbytes > 30) ...{
 return score;
 } else ...{
 return 0;
 }

 }

 /** *//**
 * 检查为utf-16的或然率
 *
 * @param content
 * @return
 */
 private int utf16probability(byte[] content) ...{

 if (content.length > 1
 && ((byte) 0xfe == content[0] && (byte) 0xff == content[1])
 || ((byte) 0xff == content[0] && (byte) 0xfe == content[1])) ...{
 return 100;
 }
 return 0;
 }

 /** *//**
 * 检查为ascii的或然率
 *
 * @param content
 * @return
 */
 private int asciiprobability(byte[] content) ...{
 int score = 75;
 int i, rawtextlen;

 rawtextlen = content.length;

 for (i = 0; i < rawtextlen; i++) ...{
 if (content[i] < 0) ...{
 score = score - 5;
 } else if (content[i] == (byte) 0x1b) ...{ // esc (used by iso 2022)
 score = score - 5;
 }
 if (score <= 0) ...{
 return 0;
 }
 }
 return score;
 }

 /** *//**
 * 检查为euc_kr的或然率
 *
 * @param content
 * @return
 */
 private int euc_krprobability(byte[] content) ...{
 int i, rawtextlen = 0;

 int dbchars = 1, krchars = 1;
 long krformat = 0, totalformat = 1;
 float rangeval = 0, formatval = 0;
 int row, column;
 rawtextlen = content.length;
 for (i = 0; i < rawtextlen - 1; i++) ...{
 if (content[i] >= 0) ...{
 } else ...{
 dbchars++;
 if ((byte) 0xa1 <= content[i] && content[i] <= (byte) 0xfe
 && (byte) 0xa1 <= content[i + 1]
 && content[i + 1] <= (byte) 0xfe) ...{
 krchars++;
 totalformat += 500;
 row = content[i] + 256 - 0xa1;
 column = content[i + 1] + 256 - 0xa1;
 if (euc_krformat[row][column] != 0) ...{
 krformat += euc_krformat[row][column];
 } else if (15 <= row && row < 55) ...{
 krformat += 0;
 }

 }
 i++;
 }
 }
 rangeval = 50 * ((float) krchars / (float) dbchars);
 formatval = 50 * ((float) krformat / (float) totalformat);

 return (int) (rangeval + formatval);
 }

 private int euc_jpprobability(byte[] content) ...{
 int i, rawtextlen = 0;

 int dbchars = 1, jpchars = 1;
 long jpformat = 0, totalformat = 1;
 float rangeval = 0, formatval = 0;
 int row, column;

 rawtextlen = content.length;
 for (i = 0; i < rawtextlen - 1; i++) ...{
 if (content[i] >= 0) ...{
 } else ...{
 dbchars++;
 if ((byte) 0xa1 <= content[i] && content[i] <= (byte) 0xfe
 && (byte) 0xa1 <= content[i + 1]
 && content[i + 1] <= (byte) 0xfe) ...{
 jpchars++;
 totalformat += 500;
 row = content[i] + 256 - 0xa1;
 column = content[i + 1] + 256 - 0xa1;
 if (jpformat[row][column] != 0) ...{
 jpformat += jpformat[row][column];
 } else if (15 <= row && row < 55) ...{
 jpformat += 0;
 }

 }
 i++;
 }
 }
 rangeval = 50 * ((float) jpchars / (float) dbchars);
 formatval = 50 * ((float) jpformat / (float) totalformat);

 return (int) (rangeval + formatval);
 }

 private int sjisprobability(byte[] content) ...{
 int i, rawtextlen = 0;

 int dbchars = 1, jpchars = 1;
 long jpformat = 0, totalformat = 1;
 float rangeval = 0, formatval = 0;
 int row, column, adjust;

 rawtextlen = content.length;
 for (i = 0; i < rawtextlen - 1; i++) ...{
 if (content[i] >= 0) ...{
 } else ...{
 dbchars++;
 if (i + 1 < content.length
 && (((byte) 0x81 <= content[i] && content[i] <= (byte) 0x9f) || ((byte) 0xe0 <= content[i] && content[i] <= (byte) 0xef))
 && (((byte) 0x40 <= content[i + 1] && content[i + 1] <= (byte) 0x7e) || ((byte) 0x80 <= content[i + 1] && content[i + 1] <= (byte) 0xfc))) ...{
 jpchars++;
 totalformat += 500;
 row = content[i] + 256;
 column = content[i + 1] + 256;
 if (column < 0x9f) ...{
 adjust = 1;
 if (column > 0x7f) ...{
 column -= 0x20;
 } else ...{
 column -= 0x19;
 }
 } else ...{
 adjust = 0;
 column -= 0x7e;
 }
 if (row < 0xa0) ...{
 row = ((row - 0x70) << 1) - adjust;
 } else ...{
 row = ((row - 0xb0) << 1) - adjust;
 }

 row -= 0x20;
 column = 0x20;
 if (row < jpformat.length && column < jpformat[row].length
 && jpformat[row][column] != 0) ...{
 jpformat += jpformat[row][column];
 }
 i++;
 } else if ((byte) 0xa1 <= content[i]
 && content[i] <= (byte) 0xdf) ...{
 }

 }
 }
 rangeval = 50 * ((float) jpchars / (float) dbchars);
 formatval = 50 * ((float) jpformat / (float) totalformat);

 return (int) (rangeval + formatval) - 1;
 }

}

encodingtest.java

package org.loon.test.encoding;
/** *//**
* title: loonframework
* description:
* copyright: copyright (c) 2008
* company: loonframework
* license: http://www.apache.org/licenses/license-2.0
* @author chenpeng
* @email：ceponline@yahoo.com.cn
* @version 0.1
*/
public class encodingtest ...{
 public static void main(string argc[]) ...{
 parseencoding parse;

 parse = new parseencoding();

 system.out.println("中国大陆：");
 system.out.println("测试字符串，编码格式="+parse.getencoding("百度".getbytes()));
 system.out.println("测试站点，编码格式="+parse.getencoding("http://www.111cn.net"));
 system.out.println();
 system.out.println("中国台湾：");
 system.out.println("测试字符串，编码格式="+parse.getencoding("".getbytes()));
 system.out.println("测试站点，编码格式="+parse.getencoding("http://tw.yahoo.com/"));
 system.out.println("测试站点(繁体字，utf编码)，编码格式="+parse.getencoding("http://.tw/jute"));
 system.out.println();
 system.out.println("日本：");
 system.out.println("测试字符串，编码格式="+parse.getencoding("".getbytes()));
 system.out.println("测试站点，编码格式="+parse.getencoding("http://www.111cn.net"));
 system.out.println();
 system.out.println("自称蚩尤后代那群……：");
 system.out.println("测试站点，编码格式="+parse.getencoding("http://www.easyjava.co.kr/"));

 }
}

时间： 2024-12-17 23:16:54

java获取字符串编码函数

java获取字符串编码函数的相关文章

php自动获取字符串编码函数mb_detect_encoding_php技巧

oracle获取字符串长度函数length()和lengthb()

escape() 字符串编码函数及其它js 编码函数

Java获取网页编码

我的Java开发学习之旅------>工具类：Java获取字符串和文件进行MD5值

js中字符串编码函数escape()、encodeURI()、encodeURIComponent()区别详解_javascript技巧

mysql获取字符串长度函数(CHAR_LENGTH)_Mysql

mysql获取字符串长度函数(CHAR_LENGTH)

php判断字符串编码函数