Java源码示例:com.chenlb.mmseg4j.Word

示例1
/**
 * Pulls the next {@link Word} from the segmenter and publishes it through the
 * term, offset and type attributes.
 *
 * @return {@code true} when a word was available and the attributes were populated;
 *         {@code false} once the segmenter yields {@code null}, after {@code end()} has been invoked
 * @throws IOException declared by the signature; presumably raised by the segmenter's read — confirm
 */
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    Word next = mmSeg.get().next();
    if (next == null) {
        // Segmenter exhausted: run end() before reporting that no token is left.
        end();
        return false;
    }
    // copyBuffer is the Lucene 3.1 API; Lucene 3.0 used termAtt.setTermBuffer(...) instead.
    termAttribute.copyBuffer(next.getSen(), next.getWordOffset(), next.getLength());
    offsetAttribute.setOffset(next.getStartOffset(), next.getEndOffset());
    typeAttribute.setType(next.getType());
    return true;
}
 
示例2
/**
 * Segments {@code text} and wraps the resulting words in a {@link MmsegToken}.
 *
 * <p>The segmenter is reset onto the text, drained until it returns {@code null},
 * and the collected words are handed to the token as an iterator.
 *
 * @param text the character sequence to segment
 * @return a {@link MmsegToken} built over the collected words
 * @throws RuntimeException wrapping any exception raised while resetting or draining the segmenter
 */
@Override
public Iterable<MmsegToken> tokenize(CharSequence text) {
    try {
        mmSeg.reset(new StringReader(text.toString()));
        LinkedList<Word> words = new LinkedList<>();
        for (Word word = mmSeg.next(); word != null; word = mmSeg.next()) {
            words.add(word);
        }
        return new MmsegToken(words.iterator());
    } catch (Exception exception) {
        throw new RuntimeException(exception);
    }
}
 
示例3
/**
 * Segments {@code sentence} into a list of {@link Term}s, one per word returned
 * by the segmenter, in the order the segmenter produces them.
 *
 * <p>This is best-effort: if the segmenter throws an {@link IOException}, the
 * sentence and the stack trace are printed and the terms collected so far are returned.
 *
 * @param sentence the text to segment
 * @return the terms collected before the segmenter was exhausted or failed
 */
@Override
public List<Term> segment(String sentence) {
    mmSeg.reset(new StringReader(sentence));
    List<Term> terms = new ArrayList<>();
    try {
        Word word;
        // The loop condition already guarantees word is non-null inside the body.
        while ((word = mmSeg.next()) != null) {
            terms.add(new Term(word.getString()));
        }
    } catch (IOException e) {
        // Deliberately not rethrown: report the offending input and keep the partial result.
        System.out.println(sentence);
        e.printStackTrace();
    }
    return terms;
}
 
示例4
/**
 * Segments the content of {@code input} and joins the resulting words with
 * {@code wordSpilt} between consecutive words.
 *
 * @param input     reader supplying the text to segment
 * @param wordSpilt separator appended between consecutive words (not before the first, not after the last)
 * @return the joined words; an empty string when the segmenter produces no word
 * @throws IOException if the segmenter fails while producing the next word
 */
public String segWords(Reader input, String wordSpilt) throws IOException {
	// Obtain the concrete segmentation algorithm to use.
	Seg algorithm = getSeg();
	MMSeg segmenter = new MMSeg(input, algorithm);
	StringBuilder joined = new StringBuilder();
	// Empty before the first word, then the caller's separator for every later word.
	String separator = "";
	for (Word word = segmenter.next(); word != null; word = segmenter.next()) {
		joined.append(separator);
		joined.append(word.getString());
		separator = wordSpilt;
	}
	return joined.toString();
}
 
示例5
@Override
public Set<String> segment(String text) {
    Set<String> result = InsertionOrderUtil.newSet();
    MMSeg mmSeg = new MMSeg(new StringReader(text), this.seg);
    try {
        Word word = null;
        while ((word = mmSeg.next()) != null) {
            result.add(word.getString());
        }
    } catch (Exception e) {
        throw new HugeException("MMSeg4j segment text '%s' failed",
                                e, text);
    }
    return result;
}
 
示例6
@Override
public final boolean incrementToken() throws IOException {
	clearAttributes();
	Word word = mmSeg.next();
	if(word != null) {
		termAtt.copyBuffer(word.getSen(), word.getWordOffset(), word.getLength());
		offsetAtt.setOffset(word.getStartOffset(), word.getEndOffset());
		typeAtt.setType(word.getType());
		return true;
	} else {
		return false;
	}
}
 
示例7
/**
 * Derives a sub-token from {@code oriToken}, tags it as a digit or letter token,
 * and offers it to {@code tokenQueue}.
 *
 * @param oriToken         token from which the sub-token is derived
 * @param termBufferOffset offset passed to {@code TokenUtils.subToken}
 * @param termBufferLength length passed to {@code TokenUtils.subToken}
 * @param type             character category; {@link Character#DECIMAL_DIGIT_NUMBER} yields
 *                         {@code Word.TYPE_DIGIT}, any other value yields {@code Word.TYPE_LETTER}
 */
private void addToken(PackedTokenAttributeImpl oriToken, int termBufferOffset, int termBufferLength, byte type) {
	PackedTokenAttributeImpl sub = TokenUtils.subToken(oriToken, termBufferOffset, termBufferLength);

	boolean isDigit = type == Character.DECIMAL_DIGIT_NUMBER;
	sub.setType(isDigit ? Word.TYPE_DIGIT : Word.TYPE_LETTER);

	tokenQueue.offer(sub);
}
 
示例8
/**
 * Segments {@code txt} with the given algorithm and returns the words as strings,
 * in the order the segmenter produces them.
 *
 * <p>This is best-effort: if the segmenter throws an {@link IOException}, the stack
 * trace is printed and the words collected so far are returned.
 *
 * @param txt the text to segment
 * @param seg the segmentation algorithm handed to {@code MMSeg}
 * @return the words collected before the segmenter was exhausted or failed
 */
public static List<String> toMMsegWords(String txt, Seg seg) {
	MMSeg segmenter = new MMSeg(new StringReader(txt), seg);
	List<String> collected = new ArrayList<String>();
	try {
		for (Word word = segmenter.next(); word != null; word = segmenter.next()) {
			collected.add(word.getString());
		}
	} catch (IOException e) {
		// Deliberately not rethrown: keep the partial result.
		e.printStackTrace();
	}
	return collected;
}
 
示例9
/**
 * Creates a token backed by the given word iterator.
 *
 * @param iterator source of {@link Word} instances; the reference is stored as-is
 *                 (no copy and no null check are performed here)
 */
public MmsegToken(Iterator<Word> iterator) {
    this.iterator = iterator;
}