[Hejes-devel] [872] midday commit, some bugs fixed

Sun Jul 28 12:32:00 CEST 2013

Revision: 872
Author:   hussami
Date:     2013-07-28 12:32:00 +0200 (Sun, 28 Jul 2013)
Log Message:
-----------
midday commit, some bugs fixed

Modified Paths:
--------------
    trunk/misc/osiris_xml/ohanalyze.py

Added Paths:
-----------
    trunk/misc/osiris_xml/mysqlhandler.py

Added: trunk/misc/osiris_xml/mysqlhandler.py
===================================================================

--- trunk/misc/osiris_xml/mysqlhandler.py	                        (rev 0)
+++ trunk/misc/osiris_xml/mysqlhandler.py	2013-07-28 10:32:00 UTC (rev 872)
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+
+import MySQLdb
+
+class MySQLHandler:
+  def __init__(self, verbose = 0):
+    self.connection = None
+    self.verbose = verbose
+    self.cursor = None
+    self.clear()
+
+  def clear(self):
+    self.disconnect()
+    self.cursor = None
+
+  def connect(self, server, user, pwd, dbs):
+    self.connection = MySQLdb.connect(host=server, user=user, passwd=pwd, \
+#      db=dbs)
+      db=dbs, charset='utf8')
+
+  def disconnect(self):
+    if self.connection != None:
+	self.connection.close()
+    self.connection = None
+
+  def execute(self, str):
+    if self.verbose > 0:
+	print str
+
+    self.cursor = self.connection.cursor()
+    self.cursor.execute(str)
+
+  def fetchall(self):
+    return self.cursor.fetchall()
+
+

Modified: trunk/misc/osiris_xml/ohanalyze.py
===================================================================
--- trunk/misc/osiris_xml/ohanalyze.py	2013-07-26 17:14:18 UTC (rev 871)
+++ trunk/misc/osiris_xml/ohanalyze.py	2013-07-28 10:32:00 UTC (rev 872)
@@ -3,45 +3,14 @@
 # coding: utf8
 
 import sys
-import MySQLdb
 import copy
 import re
 sys.path.append("../../web2py/applications/helyesiras_webdev/modules")
 from egybekulon2_humor import StemmingAnalysis
 from egybekulon2_humor import HumorAna
 import itertools
+from mysqlhandler import MySQLHandler
 
-class MySQLHandler:
-  def __init__(self, verbose = 0):
-    self.connection = None
-    self.verbose = verbose
-    self.cursor = None
-    self.clear()
-
-  def clear(self):
-    self.disconnect()
-    self.cursor = None
-
-  def connect(self, server, user, pwd, dbs):
-    self.connection = MySQLdb.connect(host=server, user=user, passwd=pwd, \
-#      db=dbs)
-      db=dbs, charset='utf8')
-
-  def disconnect(self):
-    if self.connection != None:
-	self.connection.close()
-    self.connection = None
-
-  def execute(self, str):
-    if self.verbose > 0:
-	print str
-
-    self.cursor = self.connection.cursor()
-    self.cursor.execute(str)
-
-  def fetchall(self):
-    return self.cursor.fetchall()
-
 ##########
 
 class OHTerms:
@@ -79,6 +48,7 @@
   def __init__(self, fill = True):
     self.db = MySQLHandler()
     self.db.connect("localhost", "dbdicter", "dbdicter123", "dbdict")
+#    self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
     self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
     self.splittags_ext = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3, '_IKEP': 4}
     if fill:
@@ -225,6 +195,41 @@
       result.append(insval)
     return result
 
+  def correctHumorSplit2(self, lst):
+    """
+       Logic: sequence of morphs. if a_k in splittags: insert stack top, push
+       current to stack. else, change stack top. end: insert stack top.
+    """
+    result = []
+    insval = 0
+    inslemma = ''
+    instag = ''
+    lasttag = ''
+    for q1, q2, q3 in lst:
+#      if q2 not in self.splittags_ext:
+#        raise Exception("Data inconsistency: " + str(q3))
+      if (q2 in self.splittags) and not (q2 == 'IGE' and lasttag == 'IK'):
+ #       print "appending: " + str(insval)
+        if insval > 0:
+          result.append((inslemma, instag, insval))
+        inslemma = q1
+        instag = q2
+        insval = q3
+      else:
+#        print "goon: " + str(q3+insval)
+        inslemma += q1
+        instag += q2
+        insval += q3;
+
+      lasttag = q2
+
+    if insval > 0:
+#      print "end: " + str(insval)
+      result.append((inslemma, instag, insval))
+    return result
+
+
+
   def splitString(self, s, splitlstr):
     result = []
     splitlist = self.string2intlist(splitlstr)
@@ -235,34 +240,38 @@
       previndex += i
     return result
 
-  def recurse(self, s, dct, startind):
-#    print "Start with", s, startind
+  def recurse(self, s, dct, startind, level = 0):
+    """
+      Recursively produce an analysis. Outputs to a dict.
+      The algorithm here:
+        1. make humor analysis of s, which is the substring of the total \
+            starting at index startind
+        2. for all analyses: correct them
+    """
+#    print "    Start with", s, startind, "on level", level
     hl = self.humorize(s)
-#    print "hl=", s, hl
+#    print "\thl=", s, hl
     if startind not in dct:
 #      print "insert: ", s, startind
       dct[startind] = {len(s) : 1}
-#    if len(hl) == 1:
-#      if len(hl[0]) == 1:
-#        return
-    for hle in hl:
-#      print hle
-      keyl = self.correctHumorSplit(hle)
-#      dct[s].append(keyl)
-      dct[startind][len(s)] = 1
-      pind = 0
-      for k in keyl:
-        temps = s[pind:pind + k]
-#        print "temps", temps, startind, pind
-        if (startind + pind) not in dct:
-          self.recurse(temps, dct, startind + pind)
-        elif len(temps) not in dct[startind + pind]:
-          self.recurse(temps, dct, startind + pind)
-#        else:
-#          print "already done it!"
-        pind += k
-#      print "K", keyl
 
+    for ana in hl:
+      for hle in ana:
+        keyl = [q3 for _,_,q3 in hle]
+        dct[startind][len(s)] = 1
+        pind = 0
+        for k in keyl:
+          temps = s[pind:pind + k]
+#          print "temps", temps, startind, pind
+          if (startind + pind) not in dct:
+            self.recurse(temps, dct, startind + pind, level + 1)
+          elif len(temps) not in dct[startind + pind]:
+            self.recurse(temps, dct, startind + pind, level + 1)
+#          else:
+#            print "already done it!"
+          pind += k
+#        print "K", keyl
+
   def analyzeRecursion(self, s, dct, ind, lst, result):
     l = len(s)
 #    print "anal:", ind, l
@@ -287,7 +296,10 @@
 
     
 
-  def humorSplit(self, s, splits): #filter bad splits
+  def humorSplit(self, s):
+    """
+      Split a string using the recursive helper method recurse.
+    """
     dct = {}
     self.recurse(s, dct, 0)
 #    print "dct=", dct
@@ -298,71 +310,82 @@
 #    print "----"
     return resl
 
-##cut out
-    nomoresplits = {}
-    maygoon = {}
-    for split, v in splits.iteritems():
-      sl = self.splitString(s, split) #has all substrings
-      addedFull = False
+  def doStars(self, s, result):
+    #first version: no depth
+    index = s.find('*')
+    if index == -1:
+      return (s, 'FN', len(s))
 
+    else:
+      ls = s[0:index]
+      rs = s[index+1:]
+      smrs = rs.replace("*", "")
 
-      print "r1"
-      for le in sl:
-        hl = self.humorize(le)
-        print hl
-        if hl:
-          print hl
-          for hle in hl:
-            print hle
-            keyl = self.correctHumorSplit(hle)
-            ss = ",".join(keyl)
-            print keyl
-            if len(keyl) == 1:
-              addedFull = True
-              nomoresplitsstr[len(le)] = 1
-            else:
-              maygoon[ss] = 1
-            print ss
-          if len(le) == len(s) and not addedFull:
-            nomoresplits[str(len(le))] = 1
+      #add first
+      result.append([(ls + smrs, 'FN', index + len(smrs))])
 
-    return maygoon, nomoresplits
+      #add second
+      lst = []
+      lst.append((ls, 'FN', index)) #index == len(ls), since ls = s[0:index]
+      lst.append((smrs, 'FN', len(smrs)))
+      result.append(lst)
 
+  def addAnalysis(self, s, morphlist, result):
+    relevant = self.correctHumorSplit2(\
+        [(y.lex, y.tag, len(y.lex)) for y in morphlist])
+    t1,t2,t3 = relevant[-1]
 
+    if t2 == 'NOM' or t2 == 'e3':
+      print "DELLA!"
+      del relevant[-1]
+
+    lens = [y for _,_,y in relevant]
+    sl = sum(lens)
+    ls = len(s)
+    if sl < ls:
+#      print "DISCREPA!", s
+      return 0
+
+    if (sl > ls) and (len(relevant) == 1):
+      self.doStars(relevant[0][0], result)
+      return 1
+
+    tags = [y for _,y,_ in relevant]
+    forms = [y.replace("8", "") for y,_,_ in relevant]
+    lens = [len(y) for y in forms]
+    if len(tags) != len(forms) or len(tags) != len(lens):
+      raise Exception(str(tags) + " vs " + str(forms))
+
+    lst = []
+    for i in range(0, len(tags)):
+      t = forms[i], tags[i], lens[i]
+      lst.append(t)
+    result.append(lst)
+    return 2
+
   def humorize(self, s):
+    """
+      Get the humor analysis for the string. Filtering based on tag is done
+      by a sub-function.
+      Returns nested lists of 3-tuples: (form, tag, len(form))
+    """
     s = s.rstrip()
     h = StemmingAnalysis(s, True)
 #    print "'{0}'".format(s), ":", len(h.getAnas())
     result = []
+
     for x in h.getAnas():
       locresult = []
-      tags = [y.tag for y in x.morphs]
-      forms = [y.lex for y in x.morphs]
-      lens = [len(y.lex) for y in x.morphs]
-      if len(tags) != len(forms) or len(tags) != len(lens):
-        raise Exception(str(tags) + " vs " + str(forms))
+      status = self.addAnalysis(s, x.morphs, locresult)
 
-      if tags[-1] == 'NOM' or tags[-1] == 'e3':
-        del forms[-1]
-        del tags[-1]
-        del lens[-1]
-
-      for i in range(0, len(tags)):
-        t = forms[i], tags[i], lens[i]
-        locresult.append(t)
-#      print "2:", locresult
-
       if locresult:
-        locresult = self.filterTags2(locresult, True)
-#      print "3:", locresult
-      if locresult:
         result.append(locresult)
 
 #    print result
-    if not result: #by default return the whole thing
+#    if isRoot and (not result): #by default return the whole thing
+    if True:
       t = s, 'FINISH', len(s)
-      result.append([t])
-#    print result
+      result.append([[t]])
     return result
 #      self.checkTags(l)
 
@@ -447,8 +470,11 @@
 
   def try2(self, where):
 
-#    query = "select id, actual, norm from ohdict where id > 1000 limit 200";
-    query = "select id, actual, norm from ohdict order by id asc";
+#    query = "select id, actual, norm from ohdict where id > 1000 limit 500";
+    query = "select id, actual, norm from ohdict where id > 1212 limit 2";
+#    query = "select id, actual, norm from ohdict order by id asc";
+#    query = "select id, actual, norm from ohdict where id > 0 and id not in (253, 1617, 10954, 10962, 14118, 21042, 26766, 28090, 31341, 35355, 41501, 41502, 41503, 53195, 63253, 66216, 66217, 68562, 69228, 69229, 72479, 72480, 72481, 72482, 72483, 72484, 72565, 79810, 79812, 81608, 81609, 82178, 82181, 104301, 107148, 110649, 112890) order by id asc";
+#107148: recursion depth!
     self.db.execute(query)
 
     results = self.db.fetchall()
@@ -461,13 +487,7 @@
       il = self.indexate(news, row[2])
       totsplits = []
       for ne, nind in il:
-#        termsplits = self.getAllSplits(ne, 0)
-#        print "ne=", ne
-#        print termsplits
-#        spdict = {str(len(ne)) : 1}
-#        print "_____"
-        spdict = {'6,8' : 1}
-        termsplits = self.humorSplit(ne, spdict)
+        termsplits = self.humorSplit(ne)
 
 #        self.printSplits(ne, termsplits)
 #        print termsplits
@@ -476,7 +496,7 @@
       crtl = self.cartese(totsplits)
 #      print crtl
       fin = self.getFinalSplits(row[2], crtl)
-      print row[2]
+      print row[0], row[2]
       for qq in fin:
         print "\t" + qq