[Hejes-devel] [829] dirty schmirty, but worky-worky

hejes-devel at nytud.hu hejes-devel at nytud.hu
Wed Jun 26 09:59:40 CEST 2013


Revision: 829
Author:   hussami
Date:     2013-06-26 09:59:40 +0200 (Wed, 26 Jun 2013)
Log Message:
-----------
dirty schmirty, but worky-worky

Modified Paths:
--------------
    trunk/misc/osiris_xml/ohanalyze.py

Modified: trunk/misc/osiris_xml/ohanalyze.py
===================================================================
--- trunk/misc/osiris_xml/ohanalyze.py	2013-06-25 11:48:21 UTC (rev 828)
+++ trunk/misc/osiris_xml/ohanalyze.py	2013-06-26 07:59:40 UTC (rev 829)
@@ -4,6 +4,7 @@
 
 import sys
 import MySQLdb
+import copy
 import re
 sys.path.append("../../web2py/applications/helyesiras_webdev/modules")
 from egybekulon2_humor import StemmingAnalysis
@@ -187,6 +188,19 @@
       del lst[ind]
     return lst
 
+  def filterTags2(self, lst, extended = False):
+    if extended:
+      tbl = self.splittags_ext
+    else:
+      tbl = self.splittags
+    for i in range(0, len(lst)):
+      lex, tag, length = lst[i]
+      if tag not in tbl:
+        return []
+
+    return lst
+
+
   def correctHumorSplit(self, lst):
     """
        Logic: sequence of morphs. if a_k in splittags: insert stack top, push
@@ -195,70 +209,126 @@
     result = []
     insval = 0
     for q1, q2, q3 in lst:
-      if q2 not in self.splittags_ext:
-        raise Exception("Data inconsistency: " + str(q3))
+#      if q2 not in self.splittags_ext:
+#        raise Exception("Data inconsistency: " + str(q3))
       if q2 in self.splittags:
  #       print "appending: " + str(insval)
         if insval > 0:
-          result.append(str(insval))
+          result.append(insval)
         insval = q3
       else:
 #        print "goon: " + str(q3+insval)
         insval += q3;
     if insval > 0:
 #      print "end: " + str(insval)
-      result.append(str(insval))
+      result.append(insval)
     return result
 
-  def humorSplit(self, s, splitindices): #filter bad splits
-    spdict = {str(len(s)) : 1}
-    
-    #first: process the whole string
+  def splitString(self, s, splitlstr):
+    result = []
+    splitlist = self.string2intlist(splitlstr)
+    print splitlstr, splitlist
+    previndex = 0
+    for i in splitlist:
+      result.append(s[previndex:previndex + i])
+      previndex += i
+    return result
+
+  def recurse(self, s, dct, startind):
+    print "Start with", s, startind
     hl = self.humorize(s)
-    if hl:
-#      print hl
-      hl = self.filterTags(hl, True)
-      if hl:
-#        print hl
-        for hle in hl:
-          keyl = self.correctHumorSplit(hle)
-          ss = ",".join(keyl)
-#          print ss
-          spdict[ss] = 1
+    print "hl=", s, hl
+    if startind not in dct:
+      print "insert: ", s, startind
+      dct[startind] = {len(s) : 1}
+#    if len(hl) == 1:
+#      if len(hl[0]) == 1:
+#        return
+    for hle in hl:
+      print hle
+      keyl = self.correctHumorSplit(hle)
+#      dct[s].append(keyl)
+      dct[startind][len(s)] = 1
+      pind = 0
+      for k in keyl:
+        temps = s[pind:pind + k]
+        print "temps", temps, startind, pind
+        if (startind + pind) not in dct:
+          self.recurse(temps, dct, startind + pind)
+        elif len(temps) not in dct[startind + pind]:
+          self.recurse(temps, dct, startind + pind)
+        else:
+          print "already done it!"
+        pind += k
+      print "K", keyl
 
-    #for the rest: just determine if all elements pass -- if so, add
-    for i in range(0, len(splitindices)):
-      if len(splitindices[i]) == 1: #ignore full, we've done it already
-        continue
+  def analyzeRecursion(self, s, dct, ind, lst, result):
+    l = len(s)
+    print "anal:", ind, l
+    if ind not in dct:
+      return
+    print dct[ind]
+    for k, v in dct[ind].iteritems():
+      print "?: ", k
+      if (k + ind == l):
+        print "OK!", l, ind, k
+        lst.append(k)
+        result.append(copy.deepcopy(lst))
+        del lst[-1]
+      else:
+        print "notok:", ind, k
+        lst.append(k)
+        self.analyzeRecursion(s, dct, ind + k, lst, result)
+        del lst[-1]
+        print "\tback from notok", ind
 
-      previndex = 0
-      for j in range(0, len(splitindices[i])): #for all entries in the split
-        curindex = previndex + splitindices[i][j]
-        hl = self.humorize(s[previndex:curindex])
-        if not hl: #no humor response? invalid interval, forget it
-          break
-        previndex = curindex
 
-        #ok, check if the current tag is / tags are ok
-        hl = self.filterTags(hl)
-        if not hl:
-          break
-        if j < len(splitindices[i]) - 1: #not the last elt of the split? go on
-          continue
 
-        #if we are here, that means all splits were fine
-        keyl = []
-        for qq in splitindices[i]:
-          keyl.append(str(qq))
-        ss = ','.join(ii for ii in keyl)
-#        print ss
-        spdict[ss] = 1
+    
 
-    return spdict
+  def humorSplit(self, s, splits): #filter bad splits
+    dct = {}
+    self.recurse(s, dct, 0)
+    print dct
+    lll = []
+    resl = []
+    self.analyzeRecursion(s, dct, 0, lll, resl)
+    print resl
+    print "----"
+    return resl
+    nomoresplits = {}
+    maygoon = {}
+    for split, v in splits.iteritems():
+      sl = self.splitString(s, split) #has all substrings
+      addedFull = False
 
 
+      print "r1"
+      for le in sl:
+        hl = self.humorize(le)
+        print hl
+        if hl:
+          print hl
+          for hle in hl:
+            print hle
+            keyl = self.correctHumorSplit(hle)
+            ss = ",".join(keyl)
+            print keyl
+            if len(keyl) == 1:
+              addedFull = True
+              nomoresplitsstr[len(le)] = 1
+            else:
+              maygoon[ss] = 1
+            print ss
+          if len(le) == len(s) and not addedFull:
+            nomoresplits[str(len(le))] = 1
+
+    return maygoon, nomoresplits
+
+
   def humorize(self, s):
-    h = StemmingAnalysis(s.rstrip())
+    s = s.rstrip()
+    h = StemmingAnalysis(s)
 #    print "'{0}'".format(s), ":", len(h.getAnas())
     result = []
     for x in h.getAnas():
@@ -277,41 +347,23 @@
       for i in range(0, len(tags)):
         t = forms[i], tags[i], lens[i]
         locresult.append(t)
+      print "2:", locresult
 
-      result.append(locresult)
+      if locresult:
+        locresult = self.filterTags2(locresult, True)
+      print "3:", locresult
+      if locresult:
+        result.append(locresult)
 
 #    print result
+    if not result: #by default return the whole thing
+      t = s, 'FINISH', len(s)
+      result.append([t])
+    print result
     return result
 #      self.checkTags(l)
 
 
-  def getHumorSplits(self, s):
-    h = StemmingAnalysis(s.rstrip())
-    print "'{0}'".format(s), ":", len(h.getAnas())
-    result = []
-    for x in h.getAnas():
-      locresult = []
-      tags = [y.tag for y in x.morphs]
-      forms = [y.lex for y in x.morphs]
-      lens = [len(y.lex) for y in x.morphs]
-      if len(tags) != len(forms) or len(tags) != len(lens):
-        raise Exception(str(tags) + " vs " + str(forms))
-
-      if tags[-1] == 'NOM':
-        del forms[-1]
-        del tags[-1]
-        del lens[-1]
-
-      for i in range(0, len(tags)):
-        t = forms[i], tags[i], lens[i]
-        locresult.append(t)
-
-      result.append(locresult)
-
-#    print result
-#      self.checkTags(l)
-    return result
-
   def makeIncidences(self, where):
     if where == 0:
       self.output("use dbdict", where)
@@ -368,26 +420,21 @@
     return result
 
 
-  #cartesin8 a list of dicts
-  def cartese(self, dictlist): 
-    if not dictlist:
+  #cartesin8 a list of lists
+  def cartese(self, lst): 
+    if not lst:
       raise Exception("AA")
-    if len(dictlist) == 1:
-      result = []
-      for cark, carv in dictlist[0].iteritems():
-#        result.append([cark])
-        result.append(self.string2intlist(cark))
-#      print result
-      return result
+    if len(lst) == 1:
+      return lst[0]
 
     result = []
-    nl = self.cartese(dictlist[1:])
+    nl = self.cartese(lst[1:])
 #    print dictlist
 #    print nl
 #    print dictlist[0]
-    for cark, carv in dictlist[0].iteritems():
+    for cark in lst[0]:
       for nelt in nl:
-        loclist = self.string2intlist(cark)
+        loclist = cark
         loclist.extend(nelt)
         result.append(loclist)
 #    print result
@@ -408,14 +455,18 @@
       il = self.indexate(news, row[2])
       totsplits = []
       for ne, nind in il:
-    #    aa=self.getHumorSplits(ne)
-    #    print aa
-        termsplits = self.getAllSplits(ne, 0)
+#        termsplits = self.getAllSplits(ne, 0)
 #        print ne
 #        print termsplits
-        termsplits = self.humorSplit(ne, termsplits)
+#        spdict = {str(len(ne)) : 1}
+        print "_____"
+        spdict = {'6,8' : 1}
+        termsplits = self.humorSplit(ne, spdict)
+
 #        self.printSplits(ne, termsplits)
+        print termsplits
         totsplits.append(termsplits)
+      print totsplits
       crtl = self.cartese(totsplits)
 #      print crtl
       fin = self.getFinalSplits(row[2], crtl)
@@ -428,17 +479,12 @@
 #oh.makeIncidences(1)
 oh.fill()
 oh.try2(2)
-#oh.getAllSplits("trolibusz", 0)
-
-#print oh.getAllSplits("agyagtalajhumorista", 0)
-#ll=oh.getAllSplits("agyagtalajhumorista", 0)
 #oh.humorSplit("agyagtalajhumorista", ll)
 #print oh.getAllSplits("virslitad", 0)
 #print "---"
 #ll = oh.getAllSplits("agyagos", 0)
 #print ll
 #oh.humorSplit("agyagos", ll)
-ll=oh.getAllSplits("agyagtalaj", 0)
 spl=oh.humorSplit("agyagtalaj", ll)
 oh.printSplits("agyagtalaj", spl)
 oh.bye()




More information about the Hejes-devel mailing list