Lucene:一个比较准确和高效的中文分词算法
作者:admin 日期:2006-09-07
词库约有三十万词条,加载词库的代码还需要改进。索引以后不区分大小写,并保存 token 的位置信息。
后面会陆续发布 Nutch 搜索引擎和 Lucene 关键代码的解释文章。有需要词库的可以联系:jaddy0302@126.com
注:以下代码来自 http://www.mandarintools.com/segmenter.html 。原来的代码在使用过程中有一些缺陷,所以我做了一些修改,主要是负责切分词的 tokenize() 方法。
package com.xdtech.util.lucene;
import java.lang.*;
import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import java.nio.CharBuffer;

/**
 * @author jaddy0302 (线点科技)
 */

public class XDChineseTokenizer extends Tokenizer {
// Earlier lexicon container, superseded by the static TreeMap declared further down.
//private Hashtable zhwords;
// NOTE(review): IO_BUFFER_SIZE, ioBuffer, bufferIndex and dataLen are not referenced by any
// method visible in this class; tokenize() allocates its own read buffer.
private static final int IO_BUFFER_SIZE = 256;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
private int bufferIndex = 0;
private int dataLen = 0;
// Type string passed to every Token created by tokenize(); always the empty string here.
private String tokenType = "";
// Pending tokens: filled by tokenize() and drained from the front by next().
private List tokenList = null;

/**
 * Creates a tokenizer over the text supplied by {@code reader}.
 *
 * @param reader source of the characters to segment; stored in the inherited
 *               {@code input} field and read to exhaustion by {@code tokenize()}
 */
public XDChineseTokenizer(Reader reader) {
    this.input = reader;
}
// Lexicon shared by all instances, filled by init(). Keys are words or word prefixes;
// the value is "1" for an entry read from the lexicon file and "2" for a 2- or 3-character
// prefix of a longer entry that was not itself known as a word when it was inserted.
private static TreeMap zhwords;
// Character-class sets filled by loadset() in the static initializer.
// NOTE(review): csurname and cnotname are populated but never read in this class.
private static TreeSet csurname, cforeign, cnumbers, cnotname;
// Classpath stream of the lexicon file; opened in the static initializer and read by init().
private static InputStream worddata = null;
// One-time class setup: creates the shared character sets and lexicon map, fills the sets
// from the data/*.txt classpath resources, then loads the word lexicon through init().
static {
    csurname = new TreeSet();
    cforeign = new TreeSet();
    cnumbers = new TreeSet();
    cnotname = new TreeSet();
    zhwords = new TreeMap();

    // Files prefixed "s" and "t" feed the same four sets
    // (presumably simplified and traditional variants -- confirm against the data files).
    loadset(cnumbers, "data/snumbers_u8.txt");
    loadset(cforeign, "data/sforeign_u8.txt");
    loadset(csurname, "data/ssurname_u8.txt");
    loadset(cnotname, "data/snotname_u8.txt");
    loadset(cnumbers, "data/tnumbers_u8.txt");
    loadset(cforeign, "data/tforeign_u8.txt");
    loadset(csurname, "data/tsurname_u8.txt");
    loadset(cnotname, "data/tnotname_u8.txt");

    // The lexicon is resolved relative to this class's package; init() consumes the stream.
    worddata = XDChineseTokenizer.class.getResourceAsStream("bothlexu8.txt");
    init();
}
// Charset name used by the debug printing code when encoding words.
// NOTE(review): never assigned in this class, so it is always null.
private String debugencoding;
// Enables the debug printing branches; never assigned in this class, so it stays false.
private boolean debug;
// Character-form constants. Only TRAD is referenced in this class (by main()).
public final static int TRAD = 0;
public final static int SIMP = 1;
public final static int BOTH = 2;

/**//*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next() throws IOException ...{
if (tokenList == null || tokenList.size() == 0)
tokenize();
if (tokenList.size() == 0)
return null;
Token token = (Token) tokenList.get(0);
tokenList.remove(0);
return token;
}
private static void init()
...{
int charform;
boolean loadwordfile = true ;
int count = 0;
if (!loadwordfile) ...{
return;
}
String newword = null;
try ...{

BufferedReader in = new BufferedReader(new InputStreamReader(worddata, "UTF8"));
System.out.println("开始载入词库:");
while ((newword = in.readLine()) != null) ...{
if ((newword.indexOf("#") == -1) && (newword.length() < 5)) ...{
zhwords.put(newword.intern(), "1");

if (newword.length() == 3) ...{
if (zhwords.containsKey(newword.substring(0, 2).intern()) == false) ...{
zhwords.put(newword.substring(0,2).intern(), "2");
}
}

if (newword.length() == 4) ...{
if (zhwords.containsKey(newword.substring(0, 2).intern()) == false) ...{
zhwords.put(newword.substring(0,2).intern(), "2");
}
if (zhwords.containsKey(newword.substring(0, 3).intern()) == false) ...{
zhwords.put(newword.substring(0,3).intern(), "2");
}
}

if (count++ % 40000 == 0) ...{ System.err.print(" "+count+" "); }

}
}
System.out.println("载入词库完成");
in.close();
}
catch (IOException e) ...{
System.err.println("IOException: "+e);
}
}


/** *//** Load a set of character data */
private static void loadset(TreeSet targetset, String sourcefile) ...{
String dataline;
try ...{
InputStream setdata = XDChineseTokenizer.class.getResourceAsStream(sourcefile);
BufferedReader in = new BufferedReader(new InputStreamReader(setdata, "UTF-8"));
while ((dataline = in.readLine()) != null) ...{
if ((dataline.indexOf("#") > -1) || (dataline.length() == 0)) ...{
continue;
}
targetset.add(dataline.intern());
}
in.close();
}
catch (Exception e) ...{
System.err.println("Exception loading data file" + sourcefile + " " + e);
}
}

public boolean isNumber(String testword) ...{
boolean result = true;
for (int i = 0; i < testword.length(); i++) ...{
if (cnumbers.contains(testword.substring(i, i+1).intern()) == false) ...{
result = false;
break;
}
}

if (debug) ...{
try ...{System.out.println(new String(testword.getBytes("UTF-8")) + " " + result);}
catch (Exception a) ...{ };
}
return result;
}

public boolean isAllForeign(String testword) ...{
boolean result = true;
for (int i = 0; i < testword.length(); i++) ...{
if (cforeign.contains(testword.substring(i, i+1).intern()) == false) ...{
result = false;
break;
}
}
return result;
}

public boolean isNotCJK(String testword) ...{
boolean result = true;
for (int i = 0; i < testword.length(); i++) ...{
if (Character.UnicodeBlock.of(testword.charAt(i)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) ...{
result = false;
break;
}
}
return result;
}



public String stemWord(String word) ...{
String[] prefix = new String[] ...{"第", "副", "不"};
String[] suffix = new String[] ...{"了", "的", "地", "下", "上", "中", "里",
"到", "内", "外", "们"};
String[] infix = new String[] ...{"得", "不"};
int i;
StringBuffer unstemmed = new StringBuffer(word);

for (i = 0; i < prefix.length; i++) ...{
if (unstemmed.substring(0, 1).equals(prefix[i]) == true &&
(zhwords.get(unstemmed.substring(1, unstemmed.length()).intern()) != null ||
unstemmed.length() == 2)) ...{
System.out.println("Stemmed prefix");
try ...{System.out.println(new String(unstemmed.toString().getBytes(debugencoding)));} catch (Exception a) ...{ };
unstemmed.deleteCharAt(0);
return unstemmed.toString();
}
}


for (i = 0; i < suffix.length; i++) ...{
if (unstemmed.substring(unstemmed.length()-1, unstemmed.length()).equals(suffix[i]) == true &&
(zhwords.get(unstemmed.substring(0, unstemmed.length()-1).intern()) != null ||
unstemmed.length() == 2)) ...{
System.out.println("Stemmed suffix");
try ...{System.out.println(new String(unstemmed.toString().getBytes(debugencoding)));} catch (Exception a) ...{ };
unstemmed.deleteCharAt(unstemmed.length()-1);
return unstemmed.toString();
}
}

for (i = 0; i < infix.length; i++) ...{
if (unstemmed.length() == 3 && unstemmed.substring(1, 2).equals(infix[i]) == true &&
zhwords.get(new String(unstemmed.substring(0, 1) + unstemmed.substring(2, 3)).intern()) != null) ...{
System.out.println("Stemmed infix");
unstemmed.deleteCharAt(1);
return unstemmed.toString();
}
}
return unstemmed.toString();
}


public void tokenize() throws IOException ...{
List tokenCache = new ArrayList();
StringBuffer strb = new StringBuffer() ;

while (true) ...{
char[] charb = new char[256];
if(input.read(charb)>0)
...{
strb.append(charb) ;
}else
...{
break ;
}
}
String cline, separator ;
cline = strb.toString() ;
separator = "" ;
StringBuffer currentword = new StringBuffer();
StringBuffer outline = new StringBuffer();
int clength;
char currentchar;
//separator = " ";
clength = cline.length();
int[][] offsets = new int[clength][2];
boolean isLetter =false ;
for (int i = 0; i < clength; i++) ...{
currentchar = cline.charAt(i);
if (Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
isNumber(cline.substring(i, i+1)) == true) ...{
if(isLetter)
...{
isLetter = false;
if(currentword.length()>0)
...{
// System.out.println("中文字符:"+currentchar +" 英文单词:"+currentword.toString());
Token token = new Token(currentword.toString(),
outline.toString().length(),
outline.toString().length() +
String.valueOf(currentword).
length(), tokenType);
tokenCache.add(token);
outline.append(currentword.toString());
currentword.setLength(0);
}
}
// Character in CJK block
if (currentword.length() == 0) ...{ // start looking for next word
//System.err.println("current word length 0");
if (i > 0 && (Character.isWhitespace(cline.charAt(i-1)) == false)) ...{
outline.append(separator);
}
currentword.append(currentchar);
if (debug) ...{
try ...{System.out.println(new String(currentword.toString().getBytes(debugencoding)));} catch (Exception a) ...{ };
}

} else ...{
if (zhwords.containsKey(new String(currentword.toString() + currentchar).intern()) == true &&
((String)(zhwords.get(new String(currentword.toString() + currentchar).intern()))).equals("1") == true) ...{
// word is in lexicon
currentword.append(currentchar);
if (debug) ...{
try ...{System.out.println(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) ...{ };
}
} else if (isAllForeign(currentword.toString()) &&
cforeign.contains(new String(new char[] ...{currentchar}).intern()) &&
i + 2 < clength &&
(zhwords.containsKey(cline.substring(i, i+2).intern()) == false)) ...{
// Possible a transliteration of a foreign name
currentword.append(currentchar);
if (debug) ...{
try ...{System.out.println(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) ...{ };
}
} else if (isNumber(currentword.toString()) &&
cnumbers.contains(new String(new char[] ...{currentchar}).intern())
/**//* && (i + 2 < clength) &&
(zhwords.containsKey(cline.substring(i, i+2).intern()) == false) */ ) ...{
// Put all consecutive number characters together
currentword.append(currentchar);
if (debug) ...{
try ...{System.out.println(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) ...{ };
}
} else if ((zhwords.containsKey(new String(currentword.toString() + currentchar).intern())) &&
(((String)(zhwords.get(new String(currentword.toString() +
currentchar).intern()))).equals("2") == true) &&
i + 1 < clength &&
(zhwords.containsKey(new String(currentword.toString() + currentchar +
cline.charAt(i+1)).intern()) == true))
...{
if (debug) ...{
try ...{System.out.println(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) ...{ };
}
// Starts a word in the lexicon
currentword.append(currentchar);

} else ...{ // Start anew
if (debug) ...{
try ...{System.out.println(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) ...{ };
}


Token token = new Token(currentword.toString(), outline.toString().length(),
outline.toString().length()+String.valueOf(currentword).length(), tokenType);
tokenCache.add(token);
outline.append(currentword.toString());
if (Character.isWhitespace(currentchar) == false) ...{
outline.append(separator);
}
currentword.setLength(0);
currentword.append(currentchar);
}
}

} else ...{ // Not chinese character
//System.err.println("not cjk");
if(!isLetter)
...{
// System.out.println("英文字符:"+currentchar +" 中文单词:"+currentword.toString());
if (currentword.length() > 0)
...{
Token token = new Token(currentword.toString(),
outline.toString().length(),
outline.toString().length() +
String.valueOf(currentword).length(), tokenType);
tokenCache.add(token);
outline.append(currentword.toString());
currentword.setLength(0);
outline.append(separator);
}
}
isLetter = true ;
if(Character.isLetterOrDigit(currentchar))
...{
currentword.append(currentchar);
}
//System.out.println("字符:"+currentchar +" 单词:"+currentword.toString());
if (currentword.length() > 0) ...{
if (Character.isLetterOrDigit(currentchar) == false) ...{
// System.out.println("英文字符:" + currentchar + " 英文单词:" +currentword.toString());
Token token = new Token(currentword.toString(),
outline.toString().length(),
outline.toString().length() +
String.valueOf(currentword).
length(), tokenType);
tokenCache.add(token);
outline.append(currentword.toString());
outline.append(separator);
currentword.setLength(0);
}
}
if(Character.isLetterOrDigit(currentchar)==false)
...{
outline.append(currentchar);
}
}
}
outline.append(currentword.toString());
tokenList = new ArrayList();
Token token = null ;
for(int i = 0; i < tokenCache.size(); ++i)...{
token = (Token) tokenCache.get(i);
tokenList.add(token);
}
// System.out.println("Clength:"+outline);
}



public static void main(String[] argv) ...{
int charform = XDChineseTokenizer.TRAD;
// BufferReader bu = new Buffer
String temp = "scientists 中间商CP100测试Nice情况 have revealed the 10 commandments for a long and happy life. and the rules even allow for sunbathing, drinking alcohol and eating chocolate. the list was drawn up after experts trawled medical studies published over the past 50 years examining why people are living longer. the first rule, as published in the new scientist magazine, is to enjoy yourself. married second on the list is remaining sociable, with a happy marriage and good family life being essential for health. studies have shown that marriage can add as much as seven years to a man's life and two years to a woman's life. where you live also has a huge effect on your health. the world has many 'longevity' hotspots where the number of centenarians exceeds 10 in 100,000. hawaii, sardinia, nova scotia and japan are amongst them. a drink so-called vices like wine, partying and chocolate provide the fourth rule, with countless studies showing that a little of what you fancy does you good rather than harm. work your brain the fifth rule says that you should exercise your brain to stay active. last month scientists at cambridge university said puzzles could help ward off a range of conditions, from depression to schizophrenia. knitting, doing crosswords or just walking also help. see the doctor it's simple really. if you're sick, get treatment fast. eat healthily the seventh rule is well-known - you are what you eat. scientists have recommended that the low-fat, high-fibre mediterranean diet is a model for healthy eating and a long life. a recent study found that the hearts of those who had followed the diet appeared 15 years healthier than those of volunteers of a similar age. high in fruit and vegetables, the diet also uses beans, breads and cereals. small amounts of meat and moderate amounts of fish are also eaten. take risks rule eight urges people to put more excitement into their lives. 
intellectual challenges, travelling, or learning a new language can all add years to your life. embrace technology rule nine advises those seeking a long life to embrace new technologies. be happy merely living longer is not worth doing unless it is an enjoyable experience. rule ten is that you should smile and feel happier in life. research from the netherlands showed that older men with an optimistic outlook on life were only half as likely to suffer from cardio-vascular disease as those whose world view was more negative. 最近,科学家公布了幸福长寿的十大法则。这其中甚至包括沐日光浴、饮酒和吃巧克力。 科学家们对近50年来发表的“长寿”医学研究进行了搜集和整理,最终列出了这十大长寿秘诀。 这一研究结果被刊登在《新科学杂志》上,“快乐生活”成为长寿的首要法则。 婚姻生活 长寿的第二大法则是与人交往,幸福的婚姻和和谐的家庭是健康的基本保证。 研究发现,成功的婚姻可以让男性多活七年,女性多活两年。 居住环境对健康有重大影响 世界上有很多地方被认为是长寿之乡。在这些地区,每10万人中就有10位以上是百岁老人,如美国的夏威夷,意大利的撒丁岛,加拿大的新斯科舍和日本。 饮酒 所谓的生活陋习,如饮酒、参加社交聚会和吃巧克力,成为保证长寿的第四大法则。众多研究表明,稍稍满足一下这些嗜好,不仅对身体无害,反而有益于健康。 动脑 长寿的第五大法则是勤于动脑,让大脑时时处于积极思维状态。 剑桥大学的科学家上个月宣布,一些智力玩具可以帮助人们抵御一系列心理或生理疾病,如忧郁症和精神分裂症。 编织、猜字游戏或散步也有助于健康。 就诊 这条实在简单。生病了,要赶快医治。 饮食 长寿的第七条法则大家众所周知--饮食决定健康。 科学家推荐:低脂肪、高纤维的地中海式饮食习惯的是健康饮食和长寿的典范。 一份最新研究发现,饮食习惯健康的人的心脏比其他自愿接受调查的同龄人年轻15年。 健康饮食指多食用水果和蔬菜,同时,也要摄取大豆、面包和谷类食品。 另外,要食用少量的猪肉、牛肉和羊肉以及适量的鱼肉。 冒险 第八条法则鼓励人们增添生活情趣。 智力挑战、旅游或学习一门新语言有助于人们长寿。 接受新技术 第九条长寿法则建议人们不断接受新技术。 快乐 如果生活中无乐趣可言,单一的长寿也毫无意义。 第十条长寿法则建议人们微笑,更加快乐的面对生活。 荷兰一份研究表明,与悲观的老人相比,持乐观生活态度的老人患心血管疾病的几率要减少一半";
System.out.println(" Length:"+temp);
java.io.StringReader reader = new StringReader(temp) ;
XDChineseTokenizer mainsegmenter = new XDChineseTokenizer(reader);
try ...{
for (Token token = mainsegmenter.next(); token != null;
token = mainsegmenter.next()) ...{
// System.out.println(" ceshi:"+token.termText()+" 位置信息,x:"+token.startOffset()+" ;y:"+token.endOffset());
}


} catch (IOException ex) ...{
ex.printStackTrace();
}
// mainsegmenter
// System.out.println( mainsegmenter("", " ") );
}
}




Tags: Lucene 分词
北京线点科技致力于以数据和搜索为核心的业务(全文检索、舆情监控、搜索引擎产品):http://www.xd-tech.com
- 1






