[Hejes-devel] [825] actual addition of ohanalyze
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Thu Jun 20 14:15:36 CEST 2013
Revision: 825
Author: hussami
Date: 2013-06-20 14:15:35 +0200 (Thu, 20 Jun 2013)
Log Message:
-----------
actual addition of ohanalyze
Modified Paths:
--------------
trunk/install/Makefile
trunk/install/create_dbdict_user.sql
Added Paths:
-----------
trunk/misc/osiris_xml/ohanalyze.py
Modified: trunk/install/Makefile
===================================================================
--- trunk/install/Makefile 2013-06-19 09:14:27 UTC (rev 824)
+++ trunk/install/Makefile 2013-06-20 12:15:35 UTC (rev 825)
@@ -1,3 +1,5 @@
+PYDIR=/usr/lib/python2.7/site-packages/
+
all:
@@ -31,8 +33,8 @@
-unzip -o PyHyphen-1.0beta1.zip ; \
cd ./PyHyphen-1.0beta1 ; \
sudo python ./setup.py install
- sudo cp hyph_hu_HU.dic /usr/local/lib/python2.7/dist-packages/hyphen/
- sudo ln -s /usr/local/lib/python2.7/dist-packages/hyphen/hyph_hu_HU.dic /usr/local/lib/python2.7/dist-packages/hyphen/hu_HU
+ sudo cp hyph_hu_HU.dic $(PYDIR)/hyphen/
+ sudo ln -s $(PYDIR)/hyphen/hyph_hu_HU.dic $(PYDIR)/hyphen/hu_HU
# TODO: use the actual python dist dir, e.g. python2.6 on clara
#DEPRECATED: we must use pyhyphen-1.0 + default dictionary for now instead of pyhyphen-2.0
Modified: trunk/install/create_dbdict_user.sql
===================================================================
--- trunk/install/create_dbdict_user.sql 2013-06-19 09:14:27 UTC (rev 824)
+++ trunk/install/create_dbdict_user.sql 2013-06-20 12:15:35 UTC (rev 825)
@@ -1,4 +1,4 @@
-DROP USER dbdicter;
+#DROP USER dbdicter;
CREATE USER 'dbdicter'@'localhost' IDENTIFIED BY PASSWORD '*93AAB36C15B9F354CE87A71D52E763A83B1E666D';;
GRANT FILE ON *.* TO 'dbdicter'@'%' IDENTIFIED BY PASSWORD '*93AAB36C15B9F354CE87A71D52E763A83B1E666D';
GRANT ALL PRIVILEGES ON `dbdict`.* TO 'dbdicter'@'%';
Copied: trunk/misc/osiris_xml/ohanalyze.py (from rev 819, trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py)
===================================================================
--- trunk/misc/osiris_xml/ohanalyze.py (rev 0)
+++ trunk/misc/osiris_xml/ohanalyze.py 2013-06-20 12:15:35 UTC (rev 825)
@@ -0,0 +1,448 @@
+#!/usr/bin/env python
+"""# -*- coding: utf-8 -*- """
+# coding: utf8
+
+import sys
+import MySQLdb
+import re
+from egybekulon2_humor import StemmingAnalysis
+from egybekulon2_humor import HumorAna
+import itertools
+
class MySQLHandler:
    """Thin wrapper around a MySQLdb connection and its current cursor."""

    def __init__(self, verbose = 0):
        # verbose > 0 echoes every statement before it is executed
        self.connection = None
        self.verbose = verbose
        self.cursor = None
        self.clear()

    def clear(self):
        """Drop the connection (if any) and forget the current cursor."""
        self.disconnect()
        self.cursor = None

    def connect(self, server, user, pwd, dbs):
        """Open a utf8 connection to database `dbs` on `server`."""
        self.connection = MySQLdb.connect(host=server, user=user, passwd=pwd,
                                          db=dbs, charset='utf8')

    def disconnect(self):
        """Close the connection if one is open; safe to call repeatedly."""
        # was: `if self.connection != None` -- identity test is the idiom
        if self.connection is not None:
            self.connection.close()
            self.connection = None

    def execute(self, query):
        """Run `query` on a fresh cursor (echoed first when verbose).

        The parameter was renamed from `str`, which shadowed the builtin.
        """
        if self.verbose > 0:
            print(query)

        self.cursor = self.connection.cursor()
        self.cursor.execute(query)

    def fetchall(self):
        """Return all rows of the most recently executed statement."""
        return self.cursor.fetchall()
+
+##########
+
class OHTerms:
    """A term dictionary. Downloads all distinct terms from the db."""

    def __init__(self, handler):
        # handler: a connected MySQLHandler instance
        self.mysqlhandler = handler
        self.data = {}
        self.fill()

    def fill(self):
        """(Re)load every distinct term of the `incidences` table."""
        query = "select distinct term from incidences"
        self.mysqlhandler.execute(query)
        self.data = {}
        results = self.mysqlhandler.fetchall()
        for row in results:
            # keys are utf8-encoded byte strings; values are unused
            self.data[row[0].encode("utf8")] = 0

    def isMember(self, s):
        """True iff `s` (utf8-encoded) occurs as a term."""
        return s in self.data
+
+##################
+
class OHAnalyze:
    """Splits dictionary terms into known sub-terms and validates the
    candidate splits with the Humor morphological analyzer."""

    def __init__(self, fill = True):
        # NOTE(review): hard-coded db credentials; move to config eventually.
        self.db = MySQLHandler()
        self.db.connect("localhost", "dbdicter", "dbdicter123", "dbdict")
        # morph tags that may start a new split segment; the _ext table
        # additionally accepts the _IKEP tag inside a segment
        self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
        self.splittags_ext = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3, '_IKEP': 4}
        if fill:
            # BUG FIX: used to call self.fill(self.db), but fill() takes no
            # extra argument (it reads self.db itself) -> TypeError.
            self.fill()

    def fill(self):
        """Load the term dictionary from the db into self.ohterms."""
        self.ohterms = OHTerms(self.db)

    def bye(self):
        """Release the db connection."""
        if self.db is not None:
            self.db.disconnect()

    def output(self, s, where):
        """Emit statement `s`: where == 0 dumps text with a trailing ';',
        where == 1 executes it on the db, anything else is a no-op."""
        if where == 0:  # dump to text
            print(s + ";")
        elif where == 1:
            self.db.execute(s)
        else:
            pass

    def tokenize(self, s):
        """Split a string into tokens at spaces, semicolons and hyphens.
        Param: the target string
        Returns: the result list
        """
        return re.split(' |;|-', s)

    def indexate(self, lst, norm):
        """Finds an ordered set of strings in a normalized string.
        Param: the list of tokens and the normalized string
        Returns: a list of (token, index) tuples
        Raises: Exception when a token cannot be found after the previous
        match (doubles as a cheap db-consistency check).
        """
        mindex = 0
        result = []
        for l in lst:
            ind = norm.find(l, mindex)
            # BUG FIX: the original tested `mindex == -1`, which can never
            # be true; a failed find() is signalled by ind == -1.
            if ind == -1:
                # BUG FIX: was `raise 'Problem: ' + norm` -- raising a
                # plain string is not a valid exception.
                raise Exception('Problem: ' + norm)
            result.append((l, ind))
            mindex = ind + 1

        return result

    def string2intlist(self, s):
        """Parse a comma-separated string of ints into a list of ints."""
        return [int(sval) for sval in s.split(',')]

    def getAllSplits(self, s, level):
        """Recursively enumerate every way of cutting `s` into known terms.
        Param: the string and the recursion depth (pass 0 at top level)
        Returns: a list of segment-length lists; at level 0 the unsplit
        string is returned as a single segment when nothing else matches.
        """
        result = []
        ll = len(s)
        for i in range(2, ll + 1):  # ignore single-letter entries
            if not self.ohterms.isMember(s[:i]):
                continue
            if i == ll:
                # the whole remainder is a known term
                result.append([i])
                continue
            # prefix is a term: recurse on the tail and prepend our length
            for resit in self.getAllSplits(s[i:], level + 1):
                result.append([i] + resit)
        if level == 0 and not result:
            result.append([len(s)])
        return result

    def filterTags(self, lst, extended = False):
        """Drop analyses containing any tag outside the allowed set.
        Param: lst -- list of analyses (lists of (lex, tag, length)
        triples); extended selects splittags_ext instead of splittags.
        Returns: the same list object, filtered in place.
        """
        tbl = self.splittags_ext if extended else self.splittags
        killables = []
        for anali in range(0, len(lst)):
            for lex, tag, length in lst[anali]:
                if tag not in tbl:
                    killables.append(anali)
                    break
        # delete back-to-front so earlier indices stay valid
        for ind in reversed(killables):
            del lst[ind]
        return lst

    def correctHumorSplit(self, lst):
        """Merge morph lengths into split-segment lengths.

        Logic: sequence of morphs. If a tag is in splittags, flush the
        accumulated length and start a new segment; otherwise the morph
        extends the current segment. At the end, flush the remainder.
        Returns: the segment lengths as a list of strings.
        Raises: Exception on a tag outside splittags_ext.
        """
        result = []
        insval = 0
        for lex, tag, length in lst:
            if tag not in self.splittags_ext:
                raise Exception("Data inconsistency: " + str(length))
            if tag in self.splittags:
                if insval > 0:
                    result.append(str(insval))
                insval = length
            else:
                insval += length
        if insval > 0:
            result.append(str(insval))
        return result

    def humorSplit(self, s, splitindices):
        """Filter candidate splits of `s` with the Humor analyzer.
        Param: the string and a list of candidate segment-length lists
        Returns: a dict keyed by comma-joined segment lengths; the
        unsplit form is always included.
        """
        spdict = {str(len(s)): 1}

        # first: process the whole string
        hl = self.humorize(s)
        if hl:
            hl = self.filterTags(hl, True)
            for hle in hl:
                keyl = self.correctHumorSplit(hle)
                spdict[",".join(keyl)] = 1

        # for the rest: a split is kept only if every segment passes
        for i in range(0, len(splitindices)):
            if len(splitindices[i]) == 1:  # ignore full, we've done it already
                continue

            previndex = 0
            for j in range(0, len(splitindices[i])):  # all entries of the split
                curindex = previndex + splitindices[i][j]
                hl = self.humorize(s[previndex:curindex])
                if not hl:  # no humor response? invalid interval, forget it
                    break
                previndex = curindex

                # ok, check if the current tag is / tags are ok
                hl = self.filterTags(hl)
                if not hl:
                    break
                if j < len(splitindices[i]) - 1:  # not the last elt? go on
                    continue

                # if we are here, all segments of this split were fine
                spdict[",".join(str(qq) for qq in splitindices[i])] = 1

        return spdict

    def humorize(self, s):
        """Run the Humor analyzer on `s`.
        Returns: a list of analyses, each a list of (lex, tag, len(lex))
        triples; a trailing 'NOM' or 'e3' morph is stripped.
        Raises: Exception when the analyzer returns inconsistent lists.
        """
        h = StemmingAnalysis(s.rstrip())
        result = []
        for x in h.getAnas():
            tags = [y.tag for y in x.morphs]
            forms = [y.lex for y in x.morphs]
            lens = [len(y.lex) for y in x.morphs]
            if len(tags) != len(forms) or len(tags) != len(lens):
                raise Exception(str(tags) + " vs " + str(forms))

            # drop the nominative / 3rd-person-singular closing morph
            # (guard added: an empty analysis no longer raises IndexError)
            if tags and (tags[-1] == 'NOM' or tags[-1] == 'e3'):
                del forms[-1]
                del tags[-1]
                del lens[-1]

            result.append([(forms[i], tags[i], lens[i])
                           for i in range(len(tags))])

        return result

    def getHumorSplits(self, s):
        """Debug variant of humorize(): prints the analysis count and only
        strips a trailing 'NOM' morph (not 'e3')."""
        h = StemmingAnalysis(s.rstrip())
        print("'{0}' : {1}".format(s, len(h.getAnas())))
        result = []
        for x in h.getAnas():
            tags = [y.tag for y in x.morphs]
            forms = [y.lex for y in x.morphs]
            lens = [len(y.lex) for y in x.morphs]
            if len(tags) != len(forms) or len(tags) != len(lens):
                raise Exception(str(tags) + " vs " + str(forms))

            if tags and tags[-1] == 'NOM':
                del forms[-1]
                del tags[-1]
                del lens[-1]

            result.append([(forms[i], tags[i], lens[i])
                           for i in range(len(tags))])

        return result

    def makeIncidences(self, where):
        """(Re)build the incidences table from the ohdict table.
        Param: where -- 0 dumps SQL to stdout, 1 executes on the db.
        NOTE(review): the DDL below is only emitted in text mode
        (where == 0); when writing straight to the db the table must
        already exist -- confirm this is intended.
        """
        if where == 0:
            self.output("use dbdict", where)
            self.output("drop table incidences", where)
            self.output("create table incidences(term varchar(100) " +
                        "not null, dict_id int, " + "idx int)", where)
            self.output("create index incidence_index on incidences(term)",
                        where)

        self.db.execute("select id, actual, norm from ohdict")
        results = self.db.fetchall()

        for row in results:
            news = self.tokenize(row[1])
            idval = int(row[0])
            il = self.indexate(news, row[2])
            for ne, nind in il:
                # NOTE(review): terms are interpolated into SQL unescaped;
                # fine for trusted dictionary data, but the quoting breaks
                # on apostrophes -- consider parameterized queries.
                self.output("insert into incidences(term, dict_id, idx) "
                            "values('" + ne + "', " + str(idval) + ", " +
                            str(nind) + ")", where)
        if where == 1:
            self.db.connection.commit()

    def printSplits(self, s, splits):
        """Pretty-print `splits` of `s`: one '+'-joined line per key."""
        print(s)
        for spk, spv in splits.items():
            previndex = 0
            prl = []
            for v in self.string2intlist(spk):
                curindex = previndex + v
                prl.append(s[previndex:curindex])
                previndex = curindex
            print("\t" + "+".join(prl))

    def getFinalSplits(self, s, splits):
        """Render each segment-length list in `splits` as a '+'-joined
        segmentation of `s` and return the list of strings."""
        result = []
        for spt in splits:
            previndex = 0
            prl = []
            for se in spt:
                curindex = previndex + se
                prl.append(s[previndex:curindex])
                previndex = curindex
            result.append('+'.join(prl))
        return result

    def cartese(self, dictlist):
        """Cartesian product of a list of split dicts (keys are
        comma-joined segment lengths).
        Returns: a list of concatenated int lists, one per combination.
        Raises: Exception on an empty input list.
        """
        if not dictlist:
            raise Exception("AA")
        if len(dictlist) == 1:
            return [self.string2intlist(cark) for cark in dictlist[0]]

        result = []
        nl = self.cartese(dictlist[1:])
        for cark in dictlist[0]:
            for nelt in nl:
                result.append(self.string2intlist(cark) + nelt)
        return result

    def try2(self, where):
        """Experimental driver: split 200 ohdict rows (id > 1000) and
        print the resulting segmentations.
        Param: where -- accepted for symmetry with makeIncidences but
        currently unused.
        """
        self.db.execute(
            "select id, actual, norm from ohdict where id > 1000 limit 200")
        results = self.db.fetchall()

        for row in results:
            news = self.tokenize(row[1])
            il = self.indexate(news, row[2])
            totsplits = []
            for ne, nind in il:
                termsplits = self.getAllSplits(ne, 0)
                termsplits = self.humorSplit(ne, termsplits)
                totsplits.append(termsplits)
            crtl = self.cartese(totsplits)
            fin = self.getFinalSplits(row[2], crtl)
            print(row[2])
            for qq in fin:
                print("\t" + qq)
+
+
+# --- ad-hoc driver script -------------------------------------------------
+# Build the analyzer without auto-filling, load the term dictionary
+# explicitly, then run the split experiments.
+oh = OHAnalyze(False)
+#oh.makeIncidences(1)
+oh.fill()
+# where == 2 selects the no-op branch of output(); try2 prints directly.
+oh.try2(2)
+#oh.getAllSplits("trolibusz", 0)
+
+#print oh.getAllSplits("agyagtalajhumorista", 0)
+#ll=oh.getAllSplits("agyagtalajhumorista", 0)
+#oh.humorSplit("agyagtalajhumorista", ll)
+#print oh.getAllSplits("virslitad", 0)
+#print "---"
+#ll = oh.getAllSplits("agyagos", 0)
+#print ll
+#oh.humorSplit("agyagos", ll)
+# demo: enumerate, verify, and print the splits of "agyagtalaj"
+ll=oh.getAllSplits("agyagtalaj", 0)
+spl=oh.humorSplit("agyagtalaj", ll)
+oh.printSplits("agyagtalaj", spl)
+oh.bye()
+
+###
+# NOTE: Number conversions!!!
+###
+# The reminder strings below are Hungarian runtime output; gist:
+# 2) SQL SELECT DISTINCT folds accented letters (kerek/ke'rek/kere'k
+#    collide); 3) which morpheme combinations to allow is much harder
+# than expected; 4) an IK tag also needs a verb.
+print "REMINDER: have a list of banned morphemes, i.e. letters etc., test for these in getAllSplits"
+print "2: SQL select distinct eleg gazul birja az ekezeteket. pl kerek, ke'rek, kere'k mind ua."
+print "3 (ext. 1): hogy milyen morfemakombokat engedunk meg, sokkal bonyolultabb, mint gondolnank. pl. agrarberuhazas? agraripari?"
+print "4: IK cimkehez kell ige is!!!. pl. fuggelek nem lehet fugg+el+ek!"
More information about the Hejes-devel
mailing list