2017年02月18日

kuromoji-neologdを使った形態素解析

Lucene3系だった頃と書き方がちょっと変わってたのでメモしておく。

とりあえず下記を参考にdependencyは記述されているものとする。
https://github.com/codelibs/elasticsearch-analysis-kuromoji-neologd

JapaneseTokenizerを利用して形態素解析する。Unicode正規化などの前処理は、Filterを使わずにベタ書きしている。

import java.io.StringReader
import java.text.Normalizer

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.BaseFormAttribute
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.InflectionAttribute
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.ReadingAttribute

/**
 * Small demo that tokenizes a fixed Japanese sentence with the kuromoji-neologd
 * `JapaneseTokenizer` and prints, one line per token, the surface term together
 * with its base form, part of speech, pronunciation and inflection data.
 *
 * The input is normalized by hand (NFKC + lower-casing) instead of going
 * through analysis filters.
 */
object HtmlParser extends App {

  val value = "今日は晴れておりますね"
  // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
  val normValue = Normalizer.normalize(value, Normalizer.Form.NFKC).toLowerCase(java.util.Locale.ROOT)
  val reader = new StringReader(normValue)
  // Constructor arguments follow Lucene's JapaneseTokenizer signature:
  // (userDictionary, discardPunctuation, mode) -- null means no user dictionary.
  val stream = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL)

  try {
    stream.setReader(reader)
    stream.reset()

    // Attribute instances are reused across tokens, so look them up once
    // rather than on every iteration.
    val term = stream.getAttribute(classOf[CharTermAttribute])
    val baseForm = stream.getAttribute(classOf[BaseFormAttribute])
    val partOfSpeech = stream.getAttribute(classOf[PartOfSpeechAttribute])
    val reading = stream.getAttribute(classOf[ReadingAttribute])
    val inflection = stream.getAttribute(classOf[InflectionAttribute])

    while (stream.incrementToken()) {
      // Output format (including the "partOfSpeach" key) is kept as-is so
      // existing output stays comparable.
      println(s"term=${term}, " +
        s"baseForm=${baseForm.getBaseForm}, " +
        s"partOfSpeach=${partOfSpeech.getPartOfSpeech}, " +
        s"reading=${reading.getPronunciation}, " +
        s"inflectionForm=${inflection.getInflectionForm}, " +
        s"inflectionType=${inflection.getInflectionType}, ")
    }

    // TokenStream contract: signal end-of-stream before closing.
    stream.end()
  } finally {
    // Always release the tokenizer, even if tokenization fails midway.
    stream.close()
  }
}

実行結果

term=今日, baseForm=null, partOfSpeach=名詞-副詞可能, reading=キョー, inflectionForm=null, inflectionType=null, 
term=は, baseForm=null, partOfSpeach=助詞-係助詞, reading=ワ, inflectionForm=null, inflectionType=null, 
term=晴れ, baseForm=晴れる, partOfSpeach=動詞-自立, reading=ハレ, inflectionForm=連用形, inflectionType=一段, 
term=て, baseForm=null, partOfSpeach=助詞-接続助詞, reading=テ, inflectionForm=null, inflectionType=null, 
term=おり, baseForm=おる, partOfSpeach=動詞-非自立, reading=オリ, inflectionForm=連用形, inflectionType=五段・ラ行, 
term=ます, baseForm=null, partOfSpeach=助動詞, reading=マス, inflectionForm=基本形, inflectionType=特殊・マス, 
term=ね, baseForm=null, partOfSpeach=助詞-終助詞, reading=ネ, inflectionForm=null, inflectionType=null,