Lucene3系だった頃と書き方がちょっと変わってたのでメモしておく。
依存関係(dependency)は、下記を参考にすでに記述されているものとする。
https://github.com/codelibs/elasticsearch-analysis-kuromoji-neologd
JapaneseTokenizerを利用して形態素解析する。Unicode正規化などの前処理は、Filterを使わずにベタ書きしている。
import java.io.StringReader
import java.text.Normalizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.BaseFormAttribute
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.InflectionAttribute
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.ReadingAttribute
/**
 * Small runnable demo: tokenizes a fixed Japanese sentence with
 * `JapaneseTokenizer` and prints one line per token containing the surface
 * term, base form, part of speech, pronunciation and inflection form/type.
 *
 * Text normalization (NFKC + lower-casing) is done by hand before the text is
 * handed to the tokenizer instead of going through analysis filters.
 */
object HtmlParser extends App {
  val value = "今日は晴れておりますね"

  // NFKC normalization followed by lower-casing. Locale.ROOT keeps the
  // lower-casing independent of the JVM's default locale (e.g. Turkish
  // dotless-i rules would otherwise change ASCII 'I').
  val normValue = Normalizer.normalize(value, Normalizer.Form.NFKC).toLowerCase(java.util.Locale.ROOT)
  val reader = new StringReader(normValue)

  // Arguments as passed in the original: no user dictionary (null),
  // second flag true, NORMAL segmentation mode.
  // NOTE(review): in upstream Lucene the second parameter is
  // `discardPunctuation`; presumably the neologd fork keeps that meaning.
  val stream = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL)

  try {
    stream.setReader(reader)

    // Attribute instances are reused for every token, so look them up once
    // instead of on each loop iteration.
    val termAttr = stream.getAttribute(classOf[CharTermAttribute])
    val baseFormAttr = stream.getAttribute(classOf[BaseFormAttribute])
    val posAttr = stream.getAttribute(classOf[PartOfSpeechAttribute])
    val readingAttr = stream.getAttribute(classOf[ReadingAttribute])
    val inflectionAttr = stream.getAttribute(classOf[InflectionAttribute])

    // TokenStream consumer workflow: reset() -> incrementToken()* -> end() -> close().
    stream.reset()
    while (stream.incrementToken()) {
      println(s"term=${termAttr}, " +
        s"baseForm=${baseFormAttr.getBaseForm}, " +
        s"partOfSpeach=${posAttr.getPartOfSpeech}, " +
        s"reading=${readingAttr.getPronunciation}, " +
        s"inflectionForm=${inflectionAttr.getInflectionForm}, " +
        s"inflectionType=${inflectionAttr.getInflectionType}, ")
    }
    // Signal end-of-stream before closing, as the TokenStream contract requires.
    stream.end()
  } finally {
    // Always release the tokenizer (and its underlying reader), even when
    // tokenization fails part-way through.
    stream.close()
  }
}
実行結果
term=今日, baseForm=null, partOfSpeach=名詞-副詞可能, reading=キョー, inflectionForm=null, inflectionType=null, 
term=は, baseForm=null, partOfSpeach=助詞-係助詞, reading=ワ, inflectionForm=null, inflectionType=null, 
term=晴れ, baseForm=晴れる, partOfSpeach=動詞-自立, reading=ハレ, inflectionForm=連用形, inflectionType=一段, 
term=て, baseForm=null, partOfSpeach=助詞-接続助詞, reading=テ, inflectionForm=null, inflectionType=null, 
term=おり, baseForm=おる, partOfSpeach=動詞-非自立, reading=オリ, inflectionForm=連用形, inflectionType=五段・ラ行, 
term=ます, baseForm=null, partOfSpeach=助動詞, reading=マス, inflectionForm=基本形, inflectionType=特殊・マス, 
term=ね, baseForm=null, partOfSpeach=助詞-終助詞, reading=ネ, inflectionForm=null, inflectionType=null, 