[Hejes-devel] [825] actual addition of ohanalyze
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Thu Jun 20 14:15:36 CEST 2013
Revision: 825
Author: hussami
Date: 2013-06-20 14:15:35 +0200 (Thu, 20 Jun 2013)
Log Message:
-----------
actual addition of ohanalyze
Modified Paths:
--------------
trunk/install/Makefile
trunk/install/create_dbdict_user.sql
Added Paths:
-----------
trunk/misc/osiris_xml/ohanalyze.py
Modified: trunk/install/Makefile
===================================================================
--- trunk/install/Makefile 2013-06-19 09:14:27 UTC (rev 824)
+++ trunk/install/Makefile 2013-06-20 12:15:35 UTC (rev 825)
@@ -1,3 +1,5 @@
+PYDIR=/usr/lib/python2.7/site-packages/
+
all:
@@ -31,8 +33,8 @@
-unzip -o PyHyphen-1.0beta1.zip ; \
cd ./PyHyphen-1.0beta1 ; \
sudo python ./setup.py install
- sudo cp hyph_hu_HU.dic /usr/local/lib/python2.7/dist-packages/hyphen/
- sudo ln -s /usr/local/lib/python2.7/dist-packages/hyphen/hyph_hu_HU.dic /usr/local/lib/python2.7/dist-packages/hyphen/hu_HU
+ sudo cp hyph_hu_HU.dic $(PYDIR)/hyphen/
+ sudo ln -s $(PYDIR)/hyphen/hyph_hu_HU.dic $(PYDIR)/hyphen/hu_HU
# TODO: use the actual python dist dir, e.g. python2.6 on clara
#DEPRECATED: we must use pyhyphen-1.0 + default dictionary for now instead of pyhyphen-2.0
Modified: trunk/install/create_dbdict_user.sql
===================================================================
--- trunk/install/create_dbdict_user.sql 2013-06-19 09:14:27 UTC (rev 824)
+++ trunk/install/create_dbdict_user.sql 2013-06-20 12:15:35 UTC (rev 825)
@@ -1,4 +1,4 @@
-DROP USER dbdicter;
+#DROP USER dbdicter;
CREATE USER 'dbdicter'@'localhost' IDENTIFIED BY PASSWORD '*93AAB36C15B9F354CE87A71D52E763A83B1E666D';;
GRANT FILE ON *.* TO 'dbdicter'@'%' IDENTIFIED BY PASSWORD '*93AAB36C15B9F354CE87A71D52E763A83B1E666D';
GRANT ALL PRIVILEGES ON `dbdict`.* TO 'dbdicter'@'%';
Copied: trunk/misc/osiris_xml/ohanalyze.py (from rev 819, trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py)
===================================================================
--- trunk/misc/osiris_xml/ohanalyze.py (rev 0)
+++ trunk/misc/osiris_xml/ohanalyze.py 2013-06-20 12:15:35 UTC (rev 825)
@@ -0,0 +1,448 @@
+#!/usr/bin/env python
+"""# -*- coding: utf-8 -*- """
+# coding: utf8
+
+import sys
+import MySQLdb
+import re
+from egybekulon2_humor import StemmingAnalysis
+from egybekulon2_humor import HumorAna
+import itertools
+
class MySQLHandler:
    """Thin wrapper around a MySQLdb connection and its current cursor."""

    def __init__(self, verbose = 0):
        # verbose > 0 echoes every statement before it is executed
        self.connection = None
        self.verbose = verbose
        self.cursor = None
        self.clear()

    def clear(self):
        """Drop the connection (if any) and forget the current cursor."""
        self.disconnect()
        self.cursor = None

    def connect(self, server, user, pwd, dbs):
        """Open a utf8 connection to database `dbs` on `server`."""
        self.connection = MySQLdb.connect(host=server, user=user, passwd=pwd,
                                          db=dbs, charset='utf8')

    def disconnect(self):
        """Close the connection if one is open; safe to call repeatedly."""
        # was: `if self.connection != None` -- identity test is the idiom
        if self.connection is not None:
            self.connection.close()
            self.connection = None

    def execute(self, query):
        """Run `query` on a fresh cursor (echoed first when verbose).

        The parameter was renamed from `str`, which shadowed the builtin.
        """
        if self.verbose > 0:
            print(query)

        self.cursor = self.connection.cursor()
        self.cursor.execute(query)

    def fetchall(self):
        """Return all rows of the most recently executed statement."""
        return self.cursor.fetchall()
+
+##########
+
class OHTerms:
    """A term dictionary. Downloads all distinct terms from the db."""

    def __init__(self, handler):
        # handler: a connected MySQLHandler instance
        self.mysqlhandler = handler
        self.data = {}
        self.fill()

    def fill(self):
        """(Re)load every distinct term of the `incidences` table."""
        query = "select distinct term from incidences"
        self.mysqlhandler.execute(query)
        self.data = {}
        results = self.mysqlhandler.fetchall()
        for row in results:
            # keys are utf8-encoded byte strings; values are unused
            self.data[row[0].encode("utf8")] = 0

    def isMember(self, s):
        """True iff `s` (utf8-encoded) occurs as a term."""
        return s in self.data
+
+##################
+
class OHAnalyze:
    """Splits dictionary terms into known sub-terms and validates the
    candidate splits with the Humor morphological analyzer."""

    def __init__(self, fill = True):
        # NOTE(review): hard-coded db credentials; move to config eventually.
        self.db = MySQLHandler()
        self.db.connect("localhost", "dbdicter", "dbdicter123", "dbdict")
        # morph tags that may start a new split segment; the _ext table
        # additionally accepts the _IKEP tag inside a segment
        self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
        self.splittags_ext = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3, '_IKEP': 4}
        if fill:
            # BUG FIX: used to call self.fill(self.db), but fill() takes no
            # extra argument (it reads self.db itself) -> TypeError.
            self.fill()

    def fill(self):
        """Load the term dictionary from the db into self.ohterms."""
        self.ohterms = OHTerms(self.db)

    def bye(self):
        """Release the db connection."""
        if self.db is not None:
            self.db.disconnect()

    def output(self, s, where):
        """Emit statement `s`: where == 0 dumps text with a trailing ';',
        where == 1 executes it on the db, anything else is a no-op."""
        if where == 0:  # dump to text
            print(s + ";")
        elif where == 1:
            self.db.execute(s)
        else:
            pass

    def tokenize(self, s):
        """Split a string into tokens at spaces, semicolons and hyphens.
        Param: the target string
        Returns: the result list
        """
        return re.split(' |;|-', s)

    def indexate(self, lst, norm):
        """Finds an ordered set of strings in a normalized string.
        Param: the list of tokens and the normalized string
        Returns: a list of (token, index) tuples
        Raises: Exception when a token cannot be found after the previous
        match (doubles as a cheap db-consistency check).
        """
        mindex = 0
        result = []
        for l in lst:
            ind = norm.find(l, mindex)
            # BUG FIX: the original tested `mindex == -1`, which can never
            # be true; a failed find() is signalled by ind == -1.
            if ind == -1:
                # BUG FIX: was `raise 'Problem: ' + norm` -- raising a
                # plain string is not a valid exception.
                raise Exception('Problem: ' + norm)
            result.append((l, ind))
            mindex = ind + 1

        return result

    def string2intlist(self, s):
        """Parse a comma-separated string of ints into a list of ints."""
        return [int(sval) for sval in s.split(',')]

    def getAllSplits(self, s, level):
        """Recursively enumerate every way of cutting `s` into known terms.
        Param: the string and the recursion depth (pass 0 at top level)
        Returns: a list of segment-length lists; at level 0 the unsplit
        string is returned as a single segment when nothing else matches.
        """
        result = []
        ll = len(s)
        for i in range(2, ll + 1):  # ignore single-letter entries
            if not self.ohterms.isMember(s[:i]):
                continue
            if i == ll:
                # the whole remainder is a known term
                result.append([i])
                continue
            # prefix is a term: recurse on the tail and prepend our length
            for resit in self.getAllSplits(s[i:], level + 1):
                result.append([i] + resit)
        if level == 0 and not result:
            result.append([len(s)])
        return result

    def filterTags(self, lst, extended = False):
        """Drop analyses containing any tag outside the allowed set.
        Param: lst -- list of analyses (lists of (lex, tag, length)
        triples); extended selects splittags_ext instead of splittags.
        Returns: the same list object, filtered in place.
        """
        tbl = self.splittags_ext if extended else self.splittags
        killables = []
        for anali in range(0, len(lst)):
            for lex, tag, length in lst[anali]:
                if tag not in tbl:
                    killables.append(anali)
                    break
        # delete back-to-front so earlier indices stay valid
        for ind in reversed(killables):
            del lst[ind]
        return lst

    def correctHumorSplit(self, lst):
        """Merge morph lengths into split-segment lengths.

        Logic: sequence of morphs. If a tag is in splittags, flush the
        accumulated length and start a new segment; otherwise the morph
        extends the current segment. At the end, flush the remainder.
        Returns: the segment lengths as a list of strings.
        Raises: Exception on a tag outside splittags_ext.
        """
        result = []
        insval = 0
        for lex, tag, length in lst:
            if tag not in self.splittags_ext:
                raise Exception("Data inconsistency: " + str(length))
            if tag in self.splittags:
                if insval > 0:
                    result.append(str(insval))
                insval = length
            else:
                insval += length
        if insval > 0:
            result.append(str(insval))
        return result

    def humorSplit(self, s, splitindices):
        """Filter candidate splits of `s` with the Humor analyzer.
        Param: the string and a list of candidate segment-length lists
        Returns: a dict keyed by comma-joined segment lengths; the
        unsplit form is always included.
        """
        spdict = {str(len(s)): 1}

        # first: process the whole string
        hl = self.humorize(s)
        if hl:
            hl = self.filterTags(hl, True)
            for hle in hl:
                keyl = self.correctHumorSplit(hle)
                spdict[",".join(keyl)] = 1

        # for the rest: a split is kept only if every segment passes
        for i in range(0, len(splitindices)):
            if len(splitindices[i]) == 1:  # ignore full, we've done it already
                continue

            previndex = 0
            for j in range(0, len(splitindices[i])):  # all entries of the split
                curindex = previndex + splitindices[i][j]
                hl = self.humorize(s[previndex:curindex])
                if not hl:  # no humor response? invalid interval, forget it
                    break
                previndex = curindex

                # ok, check if the current tag is / tags are ok
                hl = self.filterTags(hl)
                if not hl:
                    break
                if j < len(splitindices[i]) - 1:  # not the last elt? go on
                    continue

                # if we are here, all segments of this split were fine
                spdict[",".join(str(qq) for qq in splitindices[i])] = 1

        return spdict

    def humorize(self, s):
        """Run the Humor analyzer on `s`.
        Returns: a list of analyses, each a list of (lex, tag, len(lex))
        triples; a trailing 'NOM' or 'e3' morph is stripped.
        Raises: Exception when the analyzer returns inconsistent lists.
        """
        h = StemmingAnalysis(s.rstrip())
        result = []
        for x in h.getAnas():
            tags = [y.tag for y in x.morphs]
            forms = [y.lex for y in x.morphs]
            lens = [len(y.lex) for y in x.morphs]
            if len(tags) != len(forms) or len(tags) != len(lens):
                raise Exception(str(tags) + " vs " + str(forms))

            # drop the nominative / 3rd-person-singular closing morph
            # (guard added: an empty analysis no longer raises IndexError)
            if tags and (tags[-1] == 'NOM' or tags[-1] == 'e3'):
                del forms[-1]
                del tags[-1]
                del lens[-1]

            result.append([(forms[i], tags[i], lens[i])
                           for i in range(len(tags))])

        return result

    def getHumorSplits(self, s):
        """Debug variant of humorize(): prints the analysis count and only
        strips a trailing 'NOM' morph (not 'e3')."""
        h = StemmingAnalysis(s.rstrip())
        print("'{0}' : {1}".format(s, len(h.getAnas())))
        result = []
        for x in h.getAnas():
            tags = [y.tag for y in x.morphs]
            forms = [y.lex for y in x.morphs]
            lens = [len(y.lex) for y in x.morphs]
            if len(tags) != len(forms) or len(tags) != len(lens):
                raise Exception(str(tags) + " vs " + str(forms))

            if tags and tags[-1] == 'NOM':
                del forms[-1]
                del tags[-1]
                del lens[-1]

            result.append([(forms[i], tags[i], lens[i])
                           for i in range(len(tags))])

        return result

    def makeIncidences(self, where):
        """(Re)build the incidences table from the ohdict table.
        Param: where -- 0 dumps SQL to stdout, 1 executes on the db.
        NOTE(review): the DDL below is only emitted in text mode
        (where == 0); when writing straight to the db the table must
        already exist -- confirm this is intended.
        """
        if where == 0:
            self.output("use dbdict", where)
            self.output("drop table incidences", where)
            self.output("create table incidences(term varchar(100) " +
                        "not null, dict_id int, " + "idx int)", where)
            self.output("create index incidence_index on incidences(term)",
                        where)

        self.db.execute("select id, actual, norm from ohdict")
        results = self.db.fetchall()

        for row in results:
            news = self.tokenize(row[1])
            idval = int(row[0])
            il = self.indexate(news, row[2])
            for ne, nind in il:
                # NOTE(review): terms are interpolated into SQL unescaped;
                # fine for trusted dictionary data, but the quoting breaks
                # on apostrophes -- consider parameterized queries.
                self.output("insert into incidences(term, dict_id, idx) "
                            "values('" + ne + "', " + str(idval) + ", " +
                            str(nind) + ")", where)
        if where == 1:
            self.db.connection.commit()

    def printSplits(self, s, splits):
        """Pretty-print `splits` of `s`: one '+'-joined line per key."""
        print(s)
        for spk, spv in splits.items():
            previndex = 0
            prl = []
            for v in self.string2intlist(spk):
                curindex = previndex + v
                prl.append(s[previndex:curindex])
                previndex = curindex
            print("\t" + "+".join(prl))

    def getFinalSplits(self, s, splits):
        """Render each segment-length list in `splits` as a '+'-joined
        segmentation of `s` and return the list of strings."""
        result = []
        for spt in splits:
            previndex = 0
            prl = []
            for se in spt:
                curindex = previndex + se
                prl.append(s[previndex:curindex])
                previndex = curindex
            result.append('+'.join(prl))
        return result

    def cartese(self, dictlist):
        """Cartesian product of a list of split dicts (keys are
        comma-joined segment lengths).
        Returns: a list of concatenated int lists, one per combination.
        Raises: Exception on an empty input list.
        """
        if not dictlist:
            raise Exception("AA")
        if len(dictlist) == 1:
            return [self.string2intlist(cark) for cark in dictlist[0]]

        result = []
        nl = self.cartese(dictlist[1:])
        for cark in dictlist[0]:
            for nelt in nl:
                result.append(self.string2intlist(cark) + nelt)
        return result

    def try2(self, where):
        """Experimental driver: split 200 ohdict rows (id > 1000) and
        print the resulting segmentations.
        Param: where -- accepted for symmetry with makeIncidences but
        currently unused.
        """
        self.db.execute(
            "select id, actual, norm from ohdict where id > 1000 limit 200")
        results = self.db.fetchall()

        for row in results:
            news = self.tokenize(row[1])
            il = self.indexate(news, row[2])
            totsplits = []
            for ne, nind in il:
                termsplits = self.getAllSplits(ne, 0)
                termsplits = self.humorSplit(ne, termsplits)
                totsplits.append(termsplits)
            crtl = self.cartese(totsplits)
            fin = self.getFinalSplits(row[2], crtl)
            print(row[2])
            for qq in fin:
                print("\t" + qq)
+
+
+# --- ad-hoc driver script -------------------------------------------------
+# Build the analyzer without auto-filling, load the term dictionary
+# explicitly, then run the split experiments.
+oh = OHAnalyze(False)
+#oh.makeIncidences(1)
+oh.fill()
+# where == 2 selects the no-op branch of output(); try2 prints directly.
+oh.try2(2)
+#oh.getAllSplits("trolibusz", 0)
+
+#print oh.getAllSplits("agyagtalajhumorista", 0)
+#ll=oh.getAllSplits("agyagtalajhumorista", 0)
+#oh.humorSplit("agyagtalajhumorista", ll)
+#print oh.getAllSplits("virslitad", 0)
+#print "---"
+#ll = oh.getAllSplits("agyagos", 0)
+#print ll
+#oh.humorSplit("agyagos", ll)
+# demo: enumerate, verify, and print the splits of "agyagtalaj"
+ll=oh.getAllSplits("agyagtalaj", 0)
+spl=oh.humorSplit("agyagtalaj", ll)
+oh.printSplits("agyagtalaj", spl)
+oh.bye()
+
+###
+# NOTE: Number conversions!!!
+###
+# The reminder strings below are Hungarian runtime output; gist:
+# 2) SQL SELECT DISTINCT folds accented letters (kerek/ke'rek/kere'k
+#    collide); 3) which morpheme combinations to allow is much harder
+# than expected; 4) an IK tag also needs a verb.
+print "REMINDER: have a list of banned morphemes, i.e. letters etc., test for these in getAllSplits"
+print "2: SQL select distinct eleg gazul birja az ekezeteket. pl kerek, ke'rek, kere'k mind ua."
+print "3 (ext. 1): hogy milyen morfemakombokat engedunk meg, sokkal bonyolultabb, mint gondolnank. pl. agrarberuhazas? agraripari?"
+print "4: IK cimkehez kell ige is!!!. pl. fuggelek nem lehet fugg+el+ek!"
More information about the Hejes-devel
mailing list