[Hejes-devel] [945] Zsofi's program is ready, a lot of code-cleaning done.

hejes-devel at nytud.hu
Wed Sep 18 12:40:32 CEST 2013


Revision: 945
Author:   hussami
Date:     2013-09-18 12:40:32 +0200 (Wed, 18 Sep 2013)
Log Message:
-----------
Zsofi's program is ready, a lot of code-cleaning done.
Usage: "ohanalyze.py 2" for Zsofi functionality, "ohanalyze.py 3" for SQL.
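
Both modes write to standard out; a typical session might look like this (redirecting the SQL mode into a file is just one suggested use):

    ohanalyze.py 2                      # Zsofi-style parses, one block per word
    ohanalyze.py 3 > word_indices.sql   # drop/create word_indices plus inserts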

Modified Paths:
--------------
    trunk/misc/osiris_xml/ohanalyze.py

Added Paths:
-----------
    trunk/misc/osiris_xml/attic/
    trunk/misc/osiris_xml/attic/ohanal.incidnc.py

Added: trunk/misc/osiris_xml/attic/ohanal.incidnc.py
===================================================================
--- trunk/misc/osiris_xml/attic/ohanal.incidnc.py	                        (rev 0)
+++ trunk/misc/osiris_xml/attic/ohanal.incidnc.py	2013-09-18 10:40:32 UTC (rev 945)
@@ -0,0 +1,587 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import time
+import copy
+import re
+sys.path.append("../../web2py/applications/helyesiras_webdev/modules")
+from egybekulon2_humor import StemmingAnalysis
+from egybekulon2_humor import HumorAna
+import itertools
+from mysqlhandler import MySQLHandler
+
+##########
+
+class OHTerms:
+  """A term dictionary. Downloads all terms from the db
+  """
+  
+  def __init__(self, handler):
+    self.mysqlhandler = handler
+    self.data = {}
+    self.fill()
+
+  def fill(self):
+    query = "select distinct term from incidences"
+    self.mysqlhandler.execute(query)
+    self.data = {}
+    results = self.mysqlhandler.fetchall()
+    for row in results:
+#      print row[0]
+#      self.data[row[0].encode("utf8")] = 0
+      self.data[row[0]] = 0
+#     if c == 300:
+#       print row[0].encode("utf8")
+#       print self.data
+#      if row[0].find("agyag") > -1:
+#        print row[0]
+
+  def isMember(self, s):
+    if s in self.data:
+      return True
+    return False
+
+##################
+
+class OHAnalyze:
+  def __init__(self, fill = True):
+    self.db = MySQLHandler()
+    self.db.connect("localhost", "dbdicter", "dbdicter123", "dbdict")
+    self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
+    self.splittags_ext = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3, '_IKEP': 4}
+    self.naughtychars = ' |;|-|/' #the separators we'll observe
+    if fill:
+      self.fill()
+
+  # FIXME test code!
+  def fill(self):
+    self.ohterms = OHTerms(self.db)
+
+  def bye(self):
+    if self.db != None:
+      self.db.disconnect()
+
+  def output(self, s, where):
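+    """Emit the SQL string s: print it (where == 0) or execute it on
+       the db (where == 1); any other value is a no-op.
+    """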
+    if where == 0: #dump to text
+      print s + ";"
+    elif where == 1:
+      self.db.execute(s)
+    else:
+      pass
+
+  def tokenize(self, s):
+    """Split a string into tokens
+       Param: the target string
+       Returns: the result list
+    """
+#    l = re.split(' |;|-|/', s)
+    l = re.split(self.naughtychars, s)
+    l2 = []
+    for le in l:
+      if le:
+        l2.append(le)
+    l = l2
+#    print l
+    result = []
+    for sub in l:
+#      if sub.isdigit():
+#        result.append("_NUMBER_")
+#      else:
+      result.append(sub)
+    return result
+
+  def indexate(self, lst, norm, ind_type = 2):
+    """Finds an ordered set of strings in a normalized string
+       Param: the list and the normalized string
+       Returns: a list of tuples
+       Could be done differently, but it tests DB consistency somewhat.
+    """
+    mindex = 0
+    result = []
+    for li, le in enumerate(lst):
+      ind = norm.find(le, mindex)
+      if ind == -1:
+        raise Exception('Problem: ' + norm)
+      #convert numbers!
+  #    if l.isdigit():
+  #      t = "_NUMBER_", ind
+  #    else:
+      if ind_type == 1:
+        t = le, ind
+      else:
+        t = le, li
+      result.append(t)
+      mindex = ind + 1
+
+#    print "indexate result=", result
+    return result
+
+
+  def string2intlist(self, s):
+    spl = s.split(',')
+    lst=[]
+    for sval in spl:
+      lst.append(int(sval))
+    return lst
+
+  def getAllSplits(self, s, level):
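+    """Recursively enumerate every way to split s into terms known to
+       OHTerms, as lists of chunk lengths; at the top level an
+       unsplittable s falls back to [[len(s)]].
+    """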
+    result = []
+    ll = len(s)
+    for i in range(2, ll + 1): #ignore single-letter entries
+      if self.ohterms.isMember(s[:i]):
+#        print " "*2*level + "MEMBER: " + s[:i]
+        if i == ll:
+          locresult = []
+          locresult.append(i)
+          result.append(locresult)
+#          print " "*2*level + "APPEND: " + str(locresult)
+          continue
+
+        t = self.getAllSplits(s[i:], level+1)
+        if t:
+          for resit in t:
+            locresult = []
+            locresult.append(i)
+            locresult.extend(resit)
+            result.append(locresult)
+#            print " "*2*level + "APPEND: " + str(locresult)
+#        print " "*2*level + str(t)
+#    print " "*2*level + str(result)
+    if level == 0 and not result:
+      result.append([len(s)])
+    return result
+
+  def filterTags(self, lst, extended = False):
+    killables = []
+    if extended:
+      tbl = self.splittags_ext
+    else:
+      tbl = self.splittags
+    for anali in range(0, len(lst)):
+      for lex, tag, length in lst[anali]:
+        if tag not in tbl:
+          killables.append(anali)
+          break
+#    print lst
+    for ind in reversed(killables):
+#      print "kill: ", ind
+      del lst[ind]
+    return lst
+
+  def filterTags2(self, lst, extended = False):
+    if extended:
+      tbl = self.splittags_ext
+    else:
+      tbl = self.splittags
+    for i in range(0, len(lst)):
+      lex, tag, length = lst[i]
+      if tag not in tbl:
+        return []
+
+    return lst
+
+
+  def correctHumorSplit(self, lst):
+    """
+       Logic: sequence of morphs. if a_k in splittags: insert stack top, push
+       current to stack. else, change stack top. end: insert stack top.
+    """
+    result = []
+    insval = 0
+    for q1, q2, q3 in lst:
+#      if q2 not in self.splittags_ext:
+#        raise Exception("Data inconsistency: " + str(q3))
+      if q2 in self.splittags:
+ #       print "appending: " + str(insval)
+        if insval > 0:
+          result.append(insval)
+        insval = q3
+      else:
+#        print "goon: " + str(q3+insval)
+        insval += q3
+    if insval > 0:
+#      print "end: " + str(insval)
+      result.append(insval)
+    return result
+
+  def correctHumorSplit2(self, lst):
+    """
+       Logic: sequence of morphs. if a_k in splittags: insert stack top, push
+       current to stack. else, change stack top. end: insert stack top.
+    """
+    result = []
+    insval = 0
+    inslemma = ''
+    instag = ''
+    lasttag = ''
+    for q1, q2, q3 in lst:
+#      if q2 not in self.splittags_ext:
+#        raise Exception("Data inconsistency: " + str(q3))
+      if (q2 in self.splittags) and not (q2 == 'IGE' and lasttag == 'IK'):
+ #       print "appending: " + str(insval)
+        if insval > 0:
+          result.append((inslemma, instag, insval))
+        inslemma = q1
+        instag = q2
+        insval = q3
+      else:
+#        print "goon: " + str(q3+insval)
+        inslemma += q1
+        instag += q2
+        insval += q3
+
+      lasttag = q2
+
+    if insval > 0:
+#      print "end: " + str(insval)
+      result.append((inslemma, instag, insval))
+    return result
+
+
+
+  def splitString(self, s, splitlstr):
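+    """Cut s into consecutive pieces whose lengths are given by the
+       comma-separated string splitlstr.
+    """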
+    result = []
+    splitlist = self.string2intlist(splitlstr)
+ #   print splitlstr, splitlist
+    previndex = 0
+    for i in splitlist:
+      result.append(s[previndex:previndex + i])
+      previndex += i
+    return result
+
+  def recurse(self, s, dct, startind, level = 0):
+    """
+      Recursively produce an analysis. Outputs to a dict.
+      The algorithm here:
+        1. make humor analysis of s, which is the substring of the total \
+            starting at index startind
+        2. for all analyses: correct them
+    """
+    if len(s) == 0:
+#      dct[startind] = {len(s) : 1}
+      print "EMPTY", dct
+      return
+#    print "    Start with", s, startind, "on level", level
+    hl = self.humorize(s)
+#    print "\thl=", s, hl
+    if startind not in dct:
+#      print "insert: ", s, startind
+      dct[startind] = {len(s) : 1}
+
+    for ana in hl:
+      for hle in ana:
+        keyl = [q3 for _,_,q3 in hle]
+        dct[startind][len(s)] = 1
+        pind = 0
+        for k in keyl:
+          temps = s[pind:pind + k]
+#          print "temps", temps, startind, pind
+          if (startind + pind) not in dct:
+            self.recurse(temps, dct, startind + pind, level + 1)
+          elif len(temps) not in dct[startind + pind]:
+            self.recurse(temps, dct, startind + pind, level + 1)
+#          else:
+#            print "already done it!"
+          pind += k
+#        print "K", keyl
+
+  def analyzeRecursion(self, s, dct, ind, lst, result):
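+    """Walk the table built by recurse from index ind, appending every
+       complete segmentation of s (as a list of piece lengths) to result.
+    """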
+    l = len(s)
+#    print "anal:", ind, l
+#    print dct
+    if ind not in dct:
+      return
+#    print dct[ind]
+    for k, v in dct[ind].iteritems():
+#      print "?: ", k
+      if (k + ind == l):
+#        print "OK!", l, ind, k
+        lst.append(k)
+        result.append(copy.deepcopy(lst))
+        del lst[-1]
+      else:
+#        print "notok:", ind, k
+        lst.append(k)
+        self.analyzeRecursion(s, dct, ind + k, lst, result)
+        del lst[-1]
+#        print "\tback from notok", ind
+
+#    print "result=", result
+
+    
+
+  def humorSplit(self, s):
+    """
+      Split a string using the recursive helper function recurse.
+    """
+    dct = {}
+    self.recurse(s, dct, 0)
+#    print "dct=", s, dct
+    lll = []
+    resl = []
+    self.analyzeRecursion(s, dct, 0, lll, resl)
+#    print resl
+#    print "----"
+    return resl
+
+  def doStars(self, s, result):
+    #first version: no depth
+    index = s.find('*')
+    if index == -1:
+      return (s, 'FN', len(s))
+
+    else:
+      ls = s[0:index]
+      rs = s[index+1:]
+      smrs = rs.replace("*", "")
+
+      #add first
+      result.append([(ls + smrs, 'FN', index + len(smrs))])
+
+      #add second
+      lst = []
+      lst.append((ls, 'FN', index)) #index == len(ls)
+      lst.append((smrs, 'FN', len(smrs)))
+      result.append(lst)
+
+  def addAnalysis(self, s, morphlist, result):
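+    """Turn one humor analysis (a list of morphs) into (form, tag, length)
+       tuples appended to result. Returns 0 if the analysis is rejected,
+       1 when the starred form is split via doStars, 2 on a normal append.
+    """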
+#    print [y.lex for y in morphlist]
+    relevant = self.correctHumorSplit2(\
+        [(y.lex, y.tag, len(y.lex)) for y in morphlist])
+    if not relevant:
+      return 0
+    t1,t2,t3 = relevant[-1]
+
+    if t2 == 'NOM' or t2 == 'e3':
+#      print "DELLA!"
+      del relevant[-1]
+
+    lens = [y for _,_,y in relevant]
+
+    starred = False
+    for t1, t2, t3 in relevant:
+      if t1.find("*") != -1:
+        starred = True
+    sl = sum(lens)
+    ls = len(s)
+    if (sl != ls) and (not starred):
+#      print "DISCREPA!", s
+      return 0
+
+    if (sl > ls) and (len(relevant) == 1):
+      self.doStars(relevant[0][0], result)
+      return 1
+
+    tags = [y for _,y,_ in relevant]
+    forms = [y.replace("*", "") for y,_,_ in relevant]
+    lens = [len(y) for y in forms]
+    if len(tags) != len(forms) or len(tags) != len(lens):
+      raise Exception(str(tags) + " vs " + str(forms))
+
+    lst = []
+    for i in range(0, len(tags)):
+      t = forms[i], tags[i], lens[i]
+      lst.append(t)
+    result.append(lst)
+    return 2
+
+  def humorize(self, s):
+    """
+      Get the humor analysis for the string. Filtering based on tag is done
+      by a sub-function.
+      Returns tuples of length 3: (lemma, tag, len(lemma))
+    """
+    s = s.rstrip()
+    h = StemmingAnalysis(s, True)
+#    print "'{0}'".format(s), ":", len(h.getAnas())
+    result = []
+
+    for x in h.getAnas():
+      locresult = []
+      status = self.addAnalysis(s, x.morphs, locresult)
+
+      if locresult:
+        result.append(locresult)
+
+#    print result
+#    if isRoot and (not result): #by default return the whole thing
+    if True:
+      t = s, 'FINISH', len(s)
+      result.append([[t]])
+    return result
+#      self.checkTags(l)
+
+
+  #for indexing_type use: 1 - char indices (in norm), else: token indices
+  def makeIncidences(self, where, indexing_type = 2):
+    if where == 0:
+      self.output("use dbdict", where)
+    self.output("drop table incidences", where)
+#    self.output("create table incidences(term varchar(100) " +\
+    self.output("create table incidences(term varchar(100) collate utf8_bin " +\
+      "not null, dict_id int, idx int)  engine=MyISAM default charset=utf8 " +\
+      "collate=utf8_bin", where)
+    self.output("create index incidence_index on incidences(term)", where)
+
+    query = "select id, actual, norm from ohdict"
+    self.db.execute(query)
+
+    results = self.db.fetchall()
+
+    counter = 0
+    for row in results:
+      counter += 1
+      news = self.tokenize(row[1])
+      idval = int(row[0])
+      il = self.indexate(news, row[2], indexing_type)
+      for ne, nind in il:
+        self.output("insert into incidences(term, dict_id, idx) values('" + \
+          ne + "', " + str(idval) + ", " + str(nind) + ")", where)
+    if where == 1:
+      self.db.connection.commit()
+
+  def printSplits(self, s, splits):
+    print s
+    for spk, spv in splits.iteritems():
+      previndex = 0
+      spl = spk.split(',')
+      lst=[]
+      for sval in spl:
+#        print sval
+        lst.append(int(sval))
+      prl = []
+      for v in lst:
+        curindex = previndex + v
+        prl.append(s[previndex:curindex])
+        previndex = curindex
+      print "prl=", prl
+      print "\t" + "+".join(prl)
+
+  def getFinalSplits(self, word_id, s, splits, fmt):
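+    """Render the splits of s either as '+'-joined strings (fmt == 2) or
+       as word_indices insert statements (any other fmt).
+    """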
+    result = []
+    for spi, spt in enumerate(splits):
+      previndex = 0
+      prl = []
+#      print spt
+      for se in spt:
+        curindex = previndex + se
+        prl.append(s[previndex:curindex])
+#        print "\t", previndex, curindex, s[previndex:curindex]
+        previndex = curindex
+      if fmt == 2:
+        result.append('+'.join(prl))
+      else:
+        ll = len(prl) - 1
+        for pi, pe in enumerate(prl):
+          result.append("insert into word_indices(lemma, word_id, version, word_index, islast) values ('{0}', {1}, {2}, {3}, {4});".format(pe, word_id, spi, pi, int(pi == ll)))
+
+    return result
+
+
+  #take the Cartesian product of a list of lists
+  def cartese(self, lst): 
+    if not lst:
+      raise Exception("AA")
+    if len(lst) == 1:
+      return lst[0]
+
+#    print "CARTESE INPUT", lst
+    result = []
+    nl = self.cartese(lst[1:])
+#    print dictlist
+#    print "NL=", nl
+#    print dictlist[0]
+    for cark in lst[0]:
+      for nelt in nl:
+        loclist = cark[:]
+        loclist.extend(nelt)
+        result.append(copy.deepcopy(loclist))
+#    print "CARTESE:", result
+    return result
+
+  #format: 2: zsofi, 3: sql
+  def try2(self, fmt):
+
+    if fmt == 3:
+        print "drop table word_indices;"
+        print "create table word_indices(lemma varchar(100), word_id integer, version integer, word_index integer, islast integer);"
+        print "create index wordindices_index on word_indices(lemma);"
+
+#    query = "select id, actual, norm from ohdict where id > 1000 limit 500";
+#    query = "select id, actual, norm from ohdict where id > 3022 limit 2";
+#    query = "select id, actual, norm from ohdict order by id asc";
+#    query = "select id, actual, norm from ohdict where id > 31340 and id not in (10954, 10962, 26766, 28090, 31341, 41501, 41502, 41503, 72479, 72480, 72481, 74282, 72483, 72484) order by id asc";
+#    query = "select id, actual, norm from ohdict where id=11405"
+    query = "select id, actual, norm from ohdict where id > 0 and id not in (253, 1617, 10954, 10962, 14118, 21042, 26766, 28090, 31341, 35355, 41501, 41502, 41503, 53195, 63253, 66216, 66217, 68562, 69228, 69229, 72479, 72480, 72481, 72482, 72483, 72484, 72565, 79810, 79812, 81608, 81609, 82178, 82181, 104301, 107148, 110649, 112890) order by id asc";
+#107148: recursion depth!
+    self.db.execute(query)
+
+    results = self.db.fetchall()
+
+    counter = 0
+    for ri, row in enumerate(results):
+      counter += 1
+      news = self.tokenize(row[1])
+      idval = int(row[0])
+      il = self.indexate(news, row[2], 2)
+      filterednorm = re.sub(self.naughtychars, '', row[2])
+      totsplits = []
+      for ne, nind in il:
+        termsplits = self.humorSplit(ne)
+
+#        self.printSplits(ne, termsplits)
+#        print termsplits
+        totsplits.append(termsplits)
+#      print totsplits
+      crtl = self.cartese(totsplits)
+#      print crtl
+      fin = self.getFinalSplits(row[0], filterednorm, crtl, fmt)
+      if fmt == 2:
+        print row[0], filterednorm
+      for qq in fin:
+        print "\t" + qq
+      if ri % 500 == 0:
+        time.sleep(1)
+
+
+if len(sys.argv) < 2:
+  print "usage: " + sys.argv[0] + " zsofi|anything_else"
+  exit()
+
+if sys.argv[1] != "zsofi":
+  exit()
+
+dumpmsgs = False
+
+oh = OHAnalyze(False)
+#oh.makeIncidences(0, 1)
+oh.fill()
+oh.try2(3)
+#oh.humorSplit("agyagtalajhumorista", ll)
+#print oh.getAllSplits("virslitad", 0)
+#print "---"
+#ll = oh.getAllSplits("agyagos", 0)
+#print ll
+#oh.humorSplit("agyagos", ll)
+#spl=oh.humorSplit("agyagtalaj", ll)
+#oh.printSplits("agyagtalaj", spl)
+oh.bye()
+
+
+
+###
+# NOTE: Number conversions!!!
+###
+if dumpmsgs:
+    print "REMINDER: have a list of banned morphemes, i.e. letters etc., test for these in getAllSplits"
+    print "2: SQL select distinct handles accents pretty poorly, e.g. kerek, ke'rek, kere'k all count as the same."
+    print "3 (ext. 1): which morpheme combinations we allow is much more complicated than we would think, e.g. agrarberuhazas? agraripari?"
+    print "4: an IK tag needs a verb too!!! e.g. fuggelek cannot be fugg+el+ek!"
+    print "bug: a bunch of + signs at the end of agrarerdekkepviselet"
+    print "rewrite humorize for lex7"
+    print "command-line solution: the Zsofi solution to standard out vs. creating the index table for the original task"


Property changes on: trunk/misc/osiris_xml/attic/ohanal.incidnc.py
___________________________________________________________________
Added: svn:executable
   + *
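
The heart of the archived version is the two-phase splitter: recurse fills a
table mapping each start index to the lengths of chunks humor can analyze
there, and analyzeRecursion then walks that table to enumerate every complete
segmentation. A minimal self-contained sketch of the same idea, with a toy
term set standing in for the humor analyzer (TERMS, build_table and walk are
illustrative names only, not from the repository):

    # toy stand-in for the morphological analyzer: a plain membership test
    TERMS = set(["agyag", "talaj", "humor", "ista", "humorista"])

    def build_table(s, dct, start=0):
        # phase 1 (cf. recurse): for each start index, record the lengths
        # of recognizable chunks beginning there
        if start in dct:
            return
        dct[start] = {}
        for i in range(1, len(s) - start + 1):
            if s[start:start + i] in TERMS:
                dct[start][i] = 1
                build_table(s, dct, start + i)

    def walk(s, dct, ind, acc, result):
        # phase 2 (cf. analyzeRecursion): collect length paths covering s
        if ind not in dct:
            return
        for k in dct[ind]:
            if ind + k == len(s):
                result.append(acc + [k])
            else:
                walk(s, dct, ind + k, acc + [k], result)

    dct, result = {}, []
    build_table("agyaghumorista", dct)
    walk("agyaghumorista", dct, 0, [], result)
    print(result)  # agyag+humor+ista and agyag+humorista: [[5, 5, 4], [5, 9]]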

Modified: trunk/misc/osiris_xml/ohanalyze.py
===================================================================
--- trunk/misc/osiris_xml/ohanalyze.py	2013-09-17 16:04:59 UTC (rev 944)
+++ trunk/misc/osiris_xml/ohanalyze.py	2013-09-18 10:40:32 UTC (rev 945)
@@ -14,37 +14,6 @@
 
 ##########
 
-class OHTerms:
-  """A term dictionary. Downloads all terms from the db
-  """
-  
-  def __init__(self, handler):
-    self.mysqlhandler = handler
-    self.data = {}
-    self.fill()
-
-  def fill(self):
-    query = "select distinct term from incidences"
-    self.mysqlhandler.execute(query)
-    self.data = {}
-    results = self.mysqlhandler.fetchall()
-    for row in results:
-#      print row[0]
-#      self.data[row[0].encode("utf8")] = 0
-      self.data[row[0]] = 0
-#     if c == 300:
-#       print row[0].encode("utf8")
-#       print self.data
-#      if row[0].find("agyag") > -1:
-#        print row[0]
-
-  def isMember(self, s):
-    if s in self.data:
-      return True
-    return False
-
-##################
-
 class OHAnalyze:
   def __init__(self, fill = True):
     self.db = MySQLHandler()
@@ -56,10 +25,6 @@
     if fill:
       self.fill(self.db)
 
-  # FIXME test code!
-  def fill(self):
-    self.ohterms = OHTerms(self.db)
-
   def bye(self):
     if self.db != None:
       self.db.disconnect()
@@ -94,97 +59,6 @@
       result.append(sub)
     return result
 
-  def indexate(self, lst, norm, ind_type = 2):
-    """Finds an ordered set of strings in a normalized string
-       Param: the list and the normalized string
-       Returns: a list of tuples
-       Could be done differently, but it tests DB consistency somewhat.
-    """
-    mindex = 0
-    result = []
-    for li, le in enumerate(lst):
-      ind = norm.find(le, mindex)
-      if mindex == -1:
-        raise 'Problem: ' + norm
-      #convert numbers!
-  #    if l.isdigit():
-  #      t = "_NUMBER_", ind
-  #    else:
-      if ind_type == 1:
-        t = le, ind
-      else:
-        t = le, li
-      result.append(t)
-      mindex = ind + 1;
-
-#    print "indexate result=", result
-    return result
-
-
-  def string2intlist(self, s):
-    spl = s.split(',')
-    lst=[]
-    for sval in spl:
-      lst.append(int(sval))
-    return lst
-
-  def getAllSplits(self, s, level):
-    result = []
-    ll = len(s)
-    for i in range(2, ll + 1): #ignore single-letter entries
-      if self.ohterms.isMember(s[:i]):
-#        print " "*2*level + "MEMBER: " + s[:i]
-        if i == ll:
-          locresult = []
-          locresult.append(i)
-          result.append(locresult)
-#          print " "*2*level + "APPEND: " + str(locresult)
-          continue
-
-        t = self.getAllSplits(s[i:], level+1)
-        if t:
-          for resit in t:
-            locresult = []
-            locresult.append(i)
-            locresult.extend(resit)
-            result.append(locresult)
-#            print " "*2*level + "APPEND: " + str(locresult)
-#        print " "*2*level + str(t)
-#    print " "*2*level + str(result)
-    if level == 0 and not result:
-      result.append([len(s)])
-    return result
-
-  def filterTags(self, lst, extended = False):
-    killables = []
-    if extended:
-      tbl = self.splittags_ext
-    else:
-      tbl = self.splittags
-    for anali in range(0, len(lst)):
-      for lex, tag, length in lst[anali]:
-        if tag not in tbl:
-          killables.append(anali)
-          break
-#    print lst
-    for ind in reversed(killables):
-#      print "kill: ", ind
-      del lst[ind]
-    return lst
-
-  def filterTags2(self, lst, extended = False):
-    if extended:
-      tbl = self.splittags_ext
-    else:
-      tbl = self.splittags
-    for i in range(0, len(lst)):
-      lex, tag, length = lst[i]
-      if tag not in tbl:
-        return []
-
-    return lst
-
-
   def correctHumorSplit(self, lst):
     """
        Logic: sequence of morphs. if a_k in splittags: insert stack top, push
@@ -192,29 +66,6 @@
     """
     result = []
     insval = 0
-    for q1, q2, q3 in lst:
-#      if q2 not in self.splittags_ext:
-#        raise Exception("Data inconsistency: " + str(q3))
-      if q2 in self.splittags:
- #       print "appending: " + str(insval)
-        if insval > 0:
-          result.append(insval)
-        insval = q3
-      else:
-#        print "goon: " + str(q3+insval)
-        insval += q3;
-    if insval > 0:
-#      print "end: " + str(insval)
-      result.append(insval)
-    return result
-
-  def correctHumorSplit2(self, lst):
-    """
-       Logic: sequence of morphs. if a_k in splittags: insert stack top, push
-       current to stack. else, change stack top. end: insert stack top.
-    """
-    result = []
-    insval = 0
     inslemma = ''
     instag = ''
     lasttag = ''
@@ -242,17 +93,6 @@
     return result
 
 
-
-  def splitString(self, s, splitlstr):
-    result = []
-    splitlist = self.string2intlist(splitlstr)
- #   print splitlstr, splitlist
-    previndex = 0
-    for i in splitlist:
-      result.append(s[previndex:previndex + i])
-      previndex += i
-    return result
-
   def recurse(self, s, dct, startind, level = 0):
     """
       Recursively produce an analysis. Outputs to a dict.
@@ -350,7 +190,7 @@
 
   def addAnalysis(self, s, morphlist, result):
 #    print [y.lex for y in morphlist]
-    relevant = self.correctHumorSplit2(\
+    relevant = self.correctHumorSplit(\
         [(y.lex, y.tag, len(y.lex)) for y in morphlist])
     if not relevant:
       return 0
@@ -397,7 +237,6 @@
     """
     s = s.rstrip()
     h = StemmingAnalysis(s, True)
-#    print "'{0}'".format(s), ":", len(h.getAnas())
     result = []
 
     for x in h.getAnas():
@@ -407,51 +246,22 @@
       if locresult:
         result.append(locresult)
 
-#    print result
-#    if isRoot and (not result): #by default return the whole thing
     if True:
       t = s, 'FINISH', len(s)
       result.append([[t]])
+
     return result
-#      self.checkTags(l)
 
-
-  #for indexing_type use: 1 - char indices (in norm), else: token indices
-  def makeIncidences(self, where, indexing_type = 2):
-    if where == 0:
-      self.output("use dbdict", where)
-    self.output("drop table incidences", where)
-#    self.output("create table incidences(term varchar(100) " +\
-    self.output("create table incidences(term varchar(100) collate utf8_bin " +\
-      "not null, dict_id int, idx int)  engine=MyISAM default charset=utf8 " +\
-      "collate=utf8_bin", where)
-    self.output("create index incidence_index on incidences(term)", where)
-
-    query = "select id, actual, norm from ohdict";
-    self.db.execute(query)
-
-    results = self.db.fetchall()
-
-    counter = 0
-    for row in results:
-      counter += 1
-      news = self.tokenize(row[1])
-      idval = int(row[0])
-      il = self.indexate(news, row[2], indexing_type)
-      for ne, nind in il:
-        self.output("insert into incidences(term, dict_id, idx) values('" + \
-          ne + "', " + str(idval) + ", " + str(nind) + ")", where)
-    if where == 1:
-      self.db.connection.commit()
-
   def printSplits(self, s, splits):
+    """
+      Print splits of string s based on split index list splits.
+    """
     print s
     for spk, spv in splits.iteritems():
       previndex = 0
       spl = spk.split(',')
       lst=[]
       for sval in spl:
-#        print sval
         lst.append(int(sval))
       prl = []
       for v in lst:
@@ -461,23 +271,30 @@
       print "prl=", prl
       print "\t" + "+".join(prl)
 
-  def getFinalSplits(self, word_id, s, splits, fmt):
-    result = []
+  def outputSplit(self, word_id, s, splits, fmt):
+    """
+      Create the output string for the splits, given the word id, the norm
+        string s and the format specifier fmt.
+    """
+    if fmt == 2:
+      result = ''
+    else:
+      result = "-- "
+    result += 'id={0}, norm="{1}"\n'.format(word_id, s)
+
     for spi, spt in enumerate(splits):
       previndex = 0
       prl = []
-#      print spt
       for se in spt:
         curindex = previndex + se
         prl.append(s[previndex:curindex])
-#        print "\t", previndex, curindex, s[previndex:curindex]
         previndex = curindex
       if fmt == 2:
-        result.append('+'.join(prl))
+        result += '\tparse {0}: "{1}"\n'.format(spi, '+'.join(prl))
       else:
         ll = len(prl) - 1
         for pi, pe in enumerate(prl):
-          result.append("insert into word_indices(lemma, word_id, version, word_index, islast) values ('{0}', {1}, {2}, {3}, {4});".format(pe, word_id, spi, pi, int(pi == ll)))
+          result += "\tinsert into word_indices(lemma, word_id, version, word_index, islast) values ('{0}', {1}, {2}, {3}, {4});\n".format(pe, word_id, spi, pi, int(pi == ll))
 
     return result
 
@@ -485,91 +302,82 @@
   #take the Cartesian product of a list of lists
   def cartese(self, lst): 
     if not lst:
-      raise Exception("AA")
+      raise Exception("Cannot cartesinate an empty list!")
+
+    #the cross product of a single list of alternatives is that list itself
     if len(lst) == 1:
       return lst[0]
 
-#    print "CARTESE INPUT", lst
     result = []
+    #longer lists: combine each element of the first list with every
+    #element of the cross product of the remaining lists.
     nl = self.cartese(lst[1:])
-#    print dictlist
-#    print "NL=", nl
-#    print dictlist[0]
     for cark in lst[0]:
       for nelt in nl:
         loclist = cark[:]
         loclist.extend(nelt)
         result.append(copy.deepcopy(loclist))
-#    print "CARTESE:", result
+
     return result
 
+  def readDB(self):
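+    """Fetch (id, actual, norm) rows from ohdict, skipping the known
+       problem ids (currently restricted to a 100-row test slice).
+    """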
+    query = "select id, actual, norm from ohdict where id > 1000 and id not in (253, 1617, 10954, 10962, 14118, 21042, 26766, 28090, 31341, 35355, 41501, 41502, 41503, 53195, 63253, 66216, 66217, 68562, 69228, 69229, 72479, 72480, 72481, 72482, 72483, 72484, 72565, 79810, 79812, 81608, 81609, 82178, 82181, 104301, 107148, 110649, 112890) order by id asc limit 100";
+    self.db.execute(query)
+
+    result = self.db.fetchall()
+    return result
+
+
   #format: 2: zsofi, 3: sql
-  def try2(self, fmt):
+  def analyze(self, oslist, fmt):
+    """
+      The main analysis function. oslist is a list of (id, actual, norm) from 
+        the db.
+    """
 
     if fmt == 3:
         print "drop table word_indices;"
         print "create table word_indices(lemma varchar(100), word_id integer, version integer, word_index integer, islast integer);"
         print "create index wordindices_index on word_indices(lemma);"
 
-#    query = "select id, actual, norm from ohdict where id > 1000 limit 500";
-#    query = "select id, actual, norm from ohdict where id > 3022 limit 2";
-#    query = "select id, actual, norm from ohdict order by id asc";
-#    query = "select id, actual, norm from ohdict where id > 31340 and id not in (10954, 10962, 26766, 28090, 31341, 41501, 41502, 41503, 72479, 72480, 72481, 74282, 72483, 72484) order by id asc";
-#    query = "select id, actual, norm from ohdict where id=11405"
-    query = "select id, actual, norm from ohdict where id > 0 and id not in (253, 1617, 10954, 10962, 14118, 21042, 26766, 28090, 31341, 35355, 41501, 41502, 41503, 53195, 63253, 66216, 66217, 68562, 69228, 69229, 72479, 72480, 72481, 72482, 72483, 72484, 72565, 79810, 79812, 81608, 81609, 82178, 82181, 104301, 107148, 110649, 112890) order by id asc";
-#    query = "select id, actual, norm from ohdict where id > 0 and id not in (253, 1617, 10954, 10962, 14118, 21042, 26766, 28090, 31341, 35355, 41501, 41502, 41503, 53195, 63253, 66216, 66217, 68562, 69228, 69229, 72479, 72480, 72481, 72482, 72483, 72484, 72565, 79810, 79812, 81608, 81609, 82178, 82181, 104301, 107148, 110649, 112890) order by id asc";
-#107148: recursion depth!
-    self.db.execute(query)
+    #do the following for each db row:
+    for ri, row in enumerate(oslist):
+      #tokenize the actual form
+      new_actual_l = self.tokenize(row[1])
 
-    results = self.db.fetchall()
-
-    counter = 0
-    for ri, row in enumerate(results):
-      counter += 1
-      news = self.tokenize(row[1])
-      idval = int(row[0])
-      il = self.indexate(news, row[2], 2)
+      #filter the norm for any separators we don't want
       filterednorm = re.sub(self.naughtychars, '', row[2])
+
       totsplits = []
-      for ne, nind in il:
+      for ne in new_actual_l:
+        #produce the splits (list of lists) and add to result set
         termsplits = self.humorSplit(ne)
-
-#        self.printSplits(ne, termsplits)
-#        print termsplits
         totsplits.append(termsplits)
-#      print totsplits
+
+      #now len(totsplits) == len(new_actual_l), and each term generally has
+      #more than one split, so we need to produce the cross product
       crtl = self.cartese(totsplits)
-#      print crtl
-      fin = self.getFinalSplits(row[0], filterednorm, crtl, fmt)
-      if fmt == 2:
-        print row[0], filterednorm
-      for qq in fin:
-        print "\t" + qq
-      if ri % 500 == 0:
-        time.sleep(1)
 
+      #now we just need to format the result, based on word_id, norm and the format
+      fin = self.outputSplit(row[0], filterednorm, crtl, fmt)
+      print fin
 
+
 if len(sys.argv) < 2:
-  print "usage: " + sys.argv[0] + " zsofi|anything_else"
+  print "usage: " + sys.argv[0] + " format (2 - Zsofi, 3 - SQL)"
   exit()
 
-if sys.argv[1] != "zsofi":
+if sys.argv[1] != "2" and sys.argv[1] != "3":
+  print 'Invalid format specifier, need "2" or "3".'
   exit()
 
+fmt = int(sys.argv[1])
+
 dumpmsgs = False
 
 oh = OHAnalyze(False)
-#oh.makeIncidences(0, 1)
-oh.fill()
-oh.try2(3)
-#oh.humorSplit("agyagtalajhumorista", ll)
-#print oh.getAllSplits("virslitad", 0)
-#print "---"
-#ll = oh.getAllSplits("agyagos", 0)
-#print ll
-#oh.humorSplit("agyagos", ll)
-#spl=oh.humorSplit("agyagtalaj", ll)
-#oh.printSplits("agyagtalaj", spl)
+eltlist = oh.readDB()
+oh.analyze(eltlist, fmt)
 oh.bye()
 
 

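Since each token of the actual form can have several splits, analyze combines
the per-token split lists with cartese before formatting. That cross product
can be exercised in isolation; a quick sketch with made-up split lists,
mirroring the cartese above minus the defensive deepcopy:

    def cartese(lst):
        # Cartesian product of a list of split lists; each split is a
        # list of chunk lengths
        if not lst:
            raise Exception("Cannot cartesinate an empty list!")
        if len(lst) == 1:
            return lst[0]
        result = []
        rest = cartese(lst[1:])
        for head in lst[0]:
            for tail in rest:
                result.append(head + tail)
        return result

    # token 1 splits as [5] or [2, 3]; token 2 only as [4]
    print(cartese([[[5], [2, 3]], [[4]]]))  # [[5, 4], [2, 3, 4]]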



More information about the Hejes-devel mailing list