[[ToDo]]
類似文章を比較するために作ったバギーなスクリプト。
[[mecab:http://chasen.org/~taku/software/mecab/]]を使用。
mecab_test.py
# -*- coding: cp932 -*-
from ctypes import *
class Mecab_Test:
    """Minimal ctypes wrapper around the MeCab C library (libmecab).

    The constructor loads the DLL and creates one tagger handle with
    ``mecab_new2``; ``conv_mecab`` runs ``mecab_sparse_tostr`` on it.
    """

    def __init__(self, dll_path="C:/Program Files/MeCab/bin/libmecab.dll",
                 options='-a'):
        """Load libmecab and create a tagger.

        Args:
            dll_path: Path of the MeCab shared library to load.
            options: Option string handed to ``mecab_new2`` (a byte
                string, i.e. a plain ``str`` on Python 2).

        Raises:
            OSError: raised by ctypes when the library cannot be loaded.
            RuntimeError: when ``mecab_new2`` returns a NULL handle.
        """
        # Loaded library object; kept so conv_mecab can call into it.
        self.lib = cdll.LoadLibrary(dll_path)

        # mecab_t *mecab_new2(const char *arg) -- takes ONE argument.
        self.lib.mecab_new2.restype = c_void_p
        self.lib.mecab_new2.argtypes = [c_char_p]

        # const char *mecab_sparse_tostr(mecab_t *mecab, const char *str)
        # argtypes must be declared: without them ctypes passes the handle
        # (a Python integer) as a C int, truncating the pointer on 64-bit.
        self.lib.mecab_sparse_tostr.restype = c_char_p
        self.lib.mecab_sparse_tostr.argtypes = [c_void_p, c_char_p]

        # Opaque tagger handle; c_void_p restype yields None for NULL.
        self.m = self.lib.mecab_new2(options)
        if not self.m:
            raise RuntimeError("mecab_new2 failed to create a tagger")

    def conv_mecab(self, str):
        """Analyse one piece of text and return MeCab's textual output.

        Args:
            str: Text to analyse, as a byte string in the encoding the
                MeCab dictionary expects. (The name shadows the builtin
                but is kept for compatibility with keyword callers.)

        Returns:
            The result of ``mecab_sparse_tostr``: a byte string, or
            ``None`` when the library returns NULL.
        """
        return self.lib.mecab_sparse_tostr(self.m, str)
if __name__ == "__main__":
    # Manual smoke test: analyse two sample sentences and print the raw
    # MeCab output. Requires the MeCab DLL to be installed.
    m = Mecab_Test()
    for sentence in ('今日は元気だ。', '明日は雨だろう。'):
        # Parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print(m.conv_mecab(sentence))
test.py
# -*- coding: cp932 -*-
import re,string,mecab_test
# Input document: one or more lines of Japanese text.
filename = 'sample6.txt'

# Read every line up front; the with-block closes the file even if
# readlines() raises.
with open(filename, 'r') as inputfile:
    lines = inputfile.readlines()

# Single shared tagger used by the driver loop further down.
m_test = mecab_test.Mecab_Test()
class CharSet:
    """Accumulator for the words extracted from each sentence."""

    def __init__(self):
        """Start with no words collected."""
        # Words gathered for the sentence currently being processed.
        self.first_array = []
        # One list of words per finished sentence, in processing order.
        self.second_array = []

    def drawArray(self):
        """Print each finished sentence's word list on its own line."""
        for line in self.second_array:
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print(line)


# Module-wide accumulator filled by analizer_str and printed at the end.
cs = CharSet()
def analizer_str(strs):
    """Collect the general nouns of one analysed sentence into ``cs``.

    ``strs`` is split into lines of the form
    ``surface<TAB>feature,feature,...``. Every line whose first feature
    starts with '名詞' and whose second feature starts with '一般' has its
    surface appended to ``cs.first_array``. On the first ``EOS`` line the
    collected list is moved into ``cs.second_array`` and processing stops.

    Args:
        strs: Analyser output for one sentence. ``None`` or an empty
            string is ignored.

    Returns:
        None. The result is stored in the module-level ``cs``.
    """
    if not strs:
        # Nothing to parse (e.g. the analyser returned NULL).
        return

    for basestr in strs.split('\n'):
        if basestr == 'EOS':
            # Sentence finished: hand over the list and start a fresh one.
            cs.second_array.append(cs.first_array)
            cs.first_array = []
            break

        surface, tab, feature = basestr.partition('\t')
        if not tab:
            # Blank or malformed line without a feature column; skip it
            # instead of failing on tuple unpacking.
            continue

        fields = feature.split(',')
        # The length check guards against a feature string with no comma.
        if (fields[0].startswith('名詞') and len(fields) > 1
                and fields[1].startswith('一般')):
            cs.first_array.append(surface)
# Split every input line into sentences on the full stop '。', analyse
# each sentence, then print the nouns collected per sentence.
for line in lines:
    for sentence in line.strip().split('。'):
        if not sentence:
            # Empty fragment (after the final '。' or between two
            # consecutive '。'); skip it without dropping what follows.
            continue
        # Re-attach the delimiter that split() removed before analysing.
        analizer_str(m_test.conv_mecab(sentence + '。'))

cs.drawArray()