From eebfed74341bccea26744083fbc3056e78f61da9 Mon Sep 17 00:00:00 2001 From: Itsuki Toyota Date: Mon, 12 Feb 2018 17:44:20 +0900 Subject: [PATCH 1/2] Introduce mecab-ipadic-neologd --- README.md | 39 +++++++++++++++++++++++++++++++++++++++ lib/MeCab.pm6 | 36 ++++++++++++++++++++++++++++++++++++ lib/MeCab/Model.pm6 | 40 ++++++++++++++++++++++++++++++++++------ lib/MeCab/Tagger.pm6 | 38 ++++++++++++++++++++++++++++++++++---- 4 files changed, 143 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index d8bcae2..27476dc 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,44 @@ MeCab depends on the following: Once the build starts, it automatically downloads `mecab-0.996` and `mecab-ipadic-2.7.0-20070801` with `wget` and installs these stuffs under the `$HOME/.p6mecab` directory, where `$HOME` is your home directory. +Use 3rd-party dictionary +======================== + +mecab-ipadic-neologd +-------------------- + + * Step1: download and install neologd + + $ git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git + $ cd mecab-ipadic-neologd + $ export PATH=$HOME/.p6mecab/bin:$PATH + $ ./bin/install-mecab-ipadic-neologd -p $HOME/.p6mecab/lib/mecab/dic/ipadic-neologd + + * Step2: Use .new(:dicdir(PATH_TO_THE_DIR)) + +An example for MeCab::Tagger: + + use MeCab; + use MeCab::Tagger; + + my Str $text = "トランプ大統領 ワシントンで大規模軍事パレードを指示"; + my $mecab-tagger = MeCab::Tagger.new(:dicdir("$*HOME/.p6mecab/lib/mecab/dic/ipadic-neologd")); + loop ( my MeCab::Node $node = $mecab-tagger.parse-tonode($text); $node; $node = $node.next ) { + say ($node.surface, $node.feature).join("\t"); + } + + # OUTPUT« + # BOS/EOS,*,*,*,*,*,*,*,* + # トランプ大統領 名詞,固有名詞,人名,一般,*,*,ドナルド・トランプ,トランプダイトウリョウ,トランプダイトウリョー + # ワシントン 名詞,固有名詞,地域,一般,*,*,ワシントン,ワシントン,ワシントン + # で 助詞,格助詞,一般,*,*,*,で,デ,デ + # 大規模 名詞,一般,*,*,*,*,大規模,ダイキボ,ダイキボ + # 軍事パレード 名詞,固有名詞,一般,*,*,*,軍事パレード,グンジパレード,グンジパレード + # を 助詞,格助詞,一般,*,*,*,を,ヲ,ヲ + # 指示 名詞,サ変接続,*,*,*,*,指示,シジ,シジ + # BOS/EOS,*,*,*,*,*,*,*,* + # » + AUTHOR ====== @@ -99,3 +137,4 @@ Copyright 2016 titsuki libmecab ( http://taku910.github.io/mecab/ ) by Taku Kudo is licensed under the GPL, LGPL or BSD Licenses. This library is free software; you can redistribute it and/or modify it under the Artistic License 2.0. + diff --git a/lib/MeCab.pm6 b/lib/MeCab.pm6 index 66d5823..b0dd9c1 100644 --- a/lib/MeCab.pm6 +++ b/lib/MeCab.pm6 @@ -141,6 +141,42 @@ MeCab depends on the following: Once the build starts, it automatically downloads C and C with C and installs these stuffs under the C<$HOME/.p6mecab> directory, where C<$HOME> is your home directory. +=head1 Use 3rd-party dictionary + +=head2 mecab-ipadic-neologd + +=item1 Step1: download and install neologd + + $ git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git + $ cd mecab-ipadic-neologd + $ export PATH=$HOME/.p6mecab/bin:$PATH + $ ./bin/install-mecab-ipadic-neologd -p $HOME/.p6mecab/lib/mecab/dic/ipadic-neologd + +=item1 Step2: Use .new(:dicdir(PATH_TO_THE_DIR)) + +An example for MeCab::Tagger: + + use MeCab; + use MeCab::Tagger; + + my Str $text = "トランプ大統領 ワシントンで大規模軍事パレードを指示"; + my $mecab-tagger = MeCab::Tagger.new(:dicdir("$*HOME/.p6mecab/lib/mecab/dic/ipadic-neologd")); + loop ( my MeCab::Node $node = $mecab-tagger.parse-tonode($text); $node; $node = $node.next ) { + say ($node.surface, $node.feature).join("\t"); + } + + # OUTPUT« + # BOS/EOS,*,*,*,*,*,*,*,* + # トランプ大統領 名詞,固有名詞,人名,一般,*,*,ドナルド・トランプ,トランプダイトウリョウ,トランプダイトウリョー + # ワシントン 名詞,固有名詞,地域,一般,*,*,ワシントン,ワシントン,ワシントン + # で 助詞,格助詞,一般,*,*,*,で,デ,デ + # 大規模 名詞,一般,*,*,*,*,大規模,ダイキボ,ダイキボ + # 軍事パレード 名詞,固有名詞,一般,*,*,*,軍事パレード,グンジパレード,グンジパレード + # を 助詞,格助詞,一般,*,*,*,を,ヲ,ヲ + # 指示 名詞,サ変接続,*,*,*,*,指示,シジ,シジ + # BOS/EOS,*,*,*,*,*,*,*,* + # » + =head1 AUTHOR titsuki diff --git a/lib/MeCab/Model.pm6 b/lib/MeCab/Model.pm6 index 68b3f63..699cdfd 100644 --- a/lib/MeCab/Model.pm6 +++ b/lib/MeCab/Model.pm6 @@ -13,14 +13,42 @@ my sub mecab_model_new2(Str) returns MeCab::Model is native($library) { * } my sub mecab_model_new_tagger(MeCab::Model) returns MeCab::Tagger is native($library) { * } my sub mecab_model_new_lattice(MeCab::Model) returns MeCab::Lattice is native($library) { * } -multi method new { - my Str $argv = "-C"; - mecab_model_new2($argv) +multi submethod new { + mecab_model_new2("-C") } -multi method new(Str $extra-argv) { - my Str $argv = "-C " ~ $extra-argv; - mecab_model_new2($argv) +multi submethod new(Str $argv) { + mecab_model_new2($argv); +} + +multi submethod new( + Str :$rcfile, + Str :$dicdir, + Str :$userdic, +) { + my @args; + @args.push('-C'); # allocate-sentence + + if $rcfile.defined { + $rcfile.IO.f or die "$rcfile doesn't exist."; + $dicdir.defined or die ":rcfile requires :dicdir."; + $dicdir.IO.d or die "$dicdir doesn't exist."; + + @args.push(sprintf('-r %s', $rcfile)); + } + + if $dicdir.defined { + $dicdir.IO.d or die "$dicdir doesn't exist."; + + @args.push(sprintf('-d %s', $dicdir)); + } + + if $userdic.defined { + $userdic.IO.f or die "$userdic doesn't exist."; + + @args.push(sprintf('-u %s', $userdic)) + } + mecab_model_new2(@args.join(' ')); } method create-tagger { diff --git a/lib/MeCab/Tagger.pm6 b/lib/MeCab/Tagger.pm6 index 20ea5f7..54eb20e 100644 --- a/lib/MeCab/Tagger.pm6 +++ b/lib/MeCab/Tagger.pm6 @@ -23,12 +23,42 @@ my sub mecab_sparse_tostr3(MeCab::Tagger, size_t, Str, size_t, CArray[int8], siz my sub mecab_dictionary_info(MeCab::Tagger) returns MeCab::DictionaryInfo is native($library) { * } my sub mecab_strerror(MeCab::Tagger) returns Str is native($library) { * } -multi method new(Str $arg) { - mecab_new2($arg); +multi submethod new { + mecab_new2("-C"); } -multi method new { - mecab_new2("-C"); +multi submethod new(Str $argv) { + mecab_new2($argv); +} + +multi submethod new( + Str :$rcfile, + Str :$dicdir, + Str :$userdic, +) { + my @args; + @args.push('-C'); # allocate-sentence + + if $rcfile.defined { + $rcfile.IO.f or die "$rcfile doesn't exist."; + $dicdir.defined or die ":rcfile requires :dicdir."; + $dicdir.IO.d or die "$dicdir doesn't exist."; + + @args.push(sprintf('-r %s', $rcfile)); + } + + if $dicdir.defined { + $dicdir.IO.d or die "$dicdir doesn't exist."; + + @args.push(sprintf('-d %s', $dicdir)); + } + + if $userdic.defined { + $userdic.IO.f or die "$userdic doesn't exist."; + + @args.push(sprintf('-u %s', $userdic)); + } + mecab_new2(@args.join(' ')); } method version { From 3952783c991ae8a1796dbac48767a1c447181bcf Mon Sep 17 00:00:00 2001 From: Itsuki Toyota Date: Mon, 12 Feb 2018 17:56:50 +0900 Subject: [PATCH 2/2] Fix indent --- README.md | 4 +++- lib/MeCab.pm6 | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 27476dc..2d4a1f3 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,8 @@ mecab-ipadic-neologd * Step1: download and install neologd +Example: + $ git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git $ cd mecab-ipadic-neologd $ export PATH=$HOME/.p6mecab/bin:$PATH @@ -101,7 +103,7 @@ mecab-ipadic-neologd * Step2: Use .new(:dicdir(PATH_TO_THE_DIR)) -An example for MeCab::Tagger: +Example: use MeCab; use MeCab::Tagger; diff --git a/lib/MeCab.pm6 b/lib/MeCab.pm6 index b0dd9c1..66e0b30 100644 --- a/lib/MeCab.pm6 +++ b/lib/MeCab.pm6 @@ -147,14 +147,17 @@ Once the build starts, it automatically downloads C and C