[Hejes-devel] [829] dirty schmirty, but worky-worky
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Wed Jun 26 09:59:40 CEST 2013
Revision: 829
Author: hussami
Date: 2013-06-26 09:59:40 +0200 (Wed, 26 Jun 2013)
Log Message:
-----------
dirty schmirty, but worky-worky
Modified Paths:
--------------
trunk/misc/osiris_xml/ohanalyze.py
Modified: trunk/misc/osiris_xml/ohanalyze.py
===================================================================
--- trunk/misc/osiris_xml/ohanalyze.py 2013-06-25 11:48:21 UTC (rev 828)
+++ trunk/misc/osiris_xml/ohanalyze.py 2013-06-26 07:59:40 UTC (rev 829)
@@ -4,6 +4,7 @@
import sys
import MySQLdb
+import copy
import re
sys.path.append("../../web2py/applications/helyesiras_webdev/modules")
from egybekulon2_humor import StemmingAnalysis
@@ -187,6 +188,19 @@
del lst[ind]
return lst
+ def filterTags2(self, lst, extended = False):
+ if extended:
+ tbl = self.splittags_ext
+ else:
+ tbl = self.splittags
+ for i in range(0, len(lst)):
+ lex, tag, length = lst[i]
+ if tag not in tbl:
+ return []
+
+ return lst
+
+
def correctHumorSplit(self, lst):
"""
Logic: sequence of morphs. if a_k in splittags: insert stack top, push
@@ -195,70 +209,126 @@
result = []
insval = 0
for q1, q2, q3 in lst:
- if q2 not in self.splittags_ext:
- raise Exception("Data inconsistency: " + str(q3))
+# if q2 not in self.splittags_ext:
+# raise Exception("Data inconsistency: " + str(q3))
if q2 in self.splittags:
# print "appending: " + str(insval)
if insval > 0:
- result.append(str(insval))
+ result.append(insval)
insval = q3
else:
# print "goon: " + str(q3+insval)
insval += q3;
if insval > 0:
# print "end: " + str(insval)
- result.append(str(insval))
+ result.append(insval)
return result
- def humorSplit(self, s, splitindices): #filter bad splits
- spdict = {str(len(s)) : 1}
-
- #first: process the whole string
+ def splitString(self, s, splitlstr):
+ result = []
+ splitlist = self.string2intlist(splitlstr)
+ print splitlstr, splitlist
+ previndex = 0
+ for i in splitlist:
+ result.append(s[previndex:previndex + i])
+ previndex += i
+ return result
+
+ def recurse(self, s, dct, startind):
+ print "Start with", s, startind
hl = self.humorize(s)
- if hl:
-# print hl
- hl = self.filterTags(hl, True)
- if hl:
-# print hl
- for hle in hl:
- keyl = self.correctHumorSplit(hle)
- ss = ",".join(keyl)
-# print ss
- spdict[ss] = 1
+ print "hl=", s, hl
+ if startind not in dct:
+ print "insert: ", s, startind
+ dct[startind] = {len(s) : 1}
+# if len(hl) == 1:
+# if len(hl[0]) == 1:
+# return
+ for hle in hl:
+ print hle
+ keyl = self.correctHumorSplit(hle)
+# dct[s].append(keyl)
+ dct[startind][len(s)] = 1
+ pind = 0
+ for k in keyl:
+ temps = s[pind:pind + k]
+ print "temps", temps, startind, pind
+ if (startind + pind) not in dct:
+ self.recurse(temps, dct, startind + pind)
+ elif len(temps) not in dct[startind + pind]:
+ self.recurse(temps, dct, startind + pind)
+ else:
+ print "already done it!"
+ pind += k
+ print "K", keyl
- #for the rest: just determine if all elements pass -- if so, add
- for i in range(0, len(splitindices)):
- if len(splitindices[i]) == 1: #ignore full, we've done it already
- continue
+ def analyzeRecursion(self, s, dct, ind, lst, result):
+ l = len(s)
+ print "anal:", ind, l
+ if ind not in dct:
+ return
+ print dct[ind]
+ for k, v in dct[ind].iteritems():
+ print "?: ", k
+ if (k + ind == l):
+ print "OK!", l, ind, k
+ lst.append(k)
+ result.append(copy.deepcopy(lst))
+ del lst[-1]
+ else:
+ print "notok:", ind, k
+ lst.append(k)
+ self.analyzeRecursion(s, dct, ind + k, lst, result)
+ del lst[-1]
+ print "\tback from notok", ind
- previndex = 0
- for j in range(0, len(splitindices[i])): #for all entries in the split
- curindex = previndex + splitindices[i][j]
- hl = self.humorize(s[previndex:curindex])
- if not hl: #no humor response? invalid interval, forget it
- break
- previndex = curindex
- #ok, check if the current tag is / tags are ok
- hl = self.filterTags(hl)
- if not hl:
- break
- if j < len(splitindices[i]) - 1: #not the last elt of the split? go on
- continue
- #if we are here, that means all splits were fine
- keyl = []
- for qq in splitindices[i]:
- keyl.append(str(qq))
- ss = ','.join(ii for ii in keyl)
-# print ss
- spdict[ss] = 1
+
- return spdict
+ def humorSplit(self, s, splits): #filter bad splits
+ dct = {}
+ self.recurse(s, dct, 0)
+ print dct
+ lll = []
+ resl = []
+ self.analyzeRecursion(s, dct, 0, lll, resl)
+ print resl
+ print "----"
+ return resl
+ nomoresplits = {}
+ maygoon = {}
+ for split, v in splits.iteritems():
+ sl = self.splitString(s, split) #has all substrings
+ addedFull = False
+ print "r1"
+ for le in sl:
+ hl = self.humorize(le)
+ print hl
+ if hl:
+ print hl
+ for hle in hl:
+ print hle
+ keyl = self.correctHumorSplit(hle)
+ ss = ",".join(keyl)
+ print keyl
+ if len(keyl) == 1:
+ addedFull = True
+ nomoresplitsstr[len(le)] = 1
+ else:
+ maygoon[ss] = 1
+ print ss
+ if len(le) == len(s) and not addedFull:
+ nomoresplits[str(len(le))] = 1
+
+ return maygoon, nomoresplits
+
+
def humorize(self, s):
- h = StemmingAnalysis(s.rstrip())
+ s = s.rstrip()
+ h = StemmingAnalysis(s)
# print "'{0}'".format(s), ":", len(h.getAnas())
result = []
for x in h.getAnas():
@@ -277,41 +347,23 @@
for i in range(0, len(tags)):
t = forms[i], tags[i], lens[i]
locresult.append(t)
+ print "2:", locresult
- result.append(locresult)
+ if locresult:
+ locresult = self.filterTags2(locresult, True)
+ print "3:", locresult
+ if locresult:
+ result.append(locresult)
# print result
+ if not result: #by default return the whole thing
+ t = s, 'FINISH', len(s)
+ result.append([t])
+ print result
return result
# self.checkTags(l)
- def getHumorSplits(self, s):
- h = StemmingAnalysis(s.rstrip())
- print "'{0}'".format(s), ":", len(h.getAnas())
- result = []
- for x in h.getAnas():
- locresult = []
- tags = [y.tag for y in x.morphs]
- forms = [y.lex for y in x.morphs]
- lens = [len(y.lex) for y in x.morphs]
- if len(tags) != len(forms) or len(tags) != len(lens):
- raise Exception(str(tags) + " vs " + str(forms))
-
- if tags[-1] == 'NOM':
- del forms[-1]
- del tags[-1]
- del lens[-1]
-
- for i in range(0, len(tags)):
- t = forms[i], tags[i], lens[i]
- locresult.append(t)
-
- result.append(locresult)
-
-# print result
-# self.checkTags(l)
- return result
-
def makeIncidences(self, where):
if where == 0:
self.output("use dbdict", where)
@@ -368,26 +420,21 @@
return result
- #cartesin8 a list of dicts
- def cartese(self, dictlist):
- if not dictlist:
+ #cartesin8 a list of lists
+ def cartese(self, lst):
+ if not lst:
raise Exception("AA")
- if len(dictlist) == 1:
- result = []
- for cark, carv in dictlist[0].iteritems():
-# result.append([cark])
- result.append(self.string2intlist(cark))
-# print result
- return result
+ if len(lst) == 1:
+ return lst[0]
result = []
- nl = self.cartese(dictlist[1:])
+ nl = self.cartese(lst[1:])
# print dictlist
# print nl
# print dictlist[0]
- for cark, carv in dictlist[0].iteritems():
+ for cark in lst[0]:
for nelt in nl:
- loclist = self.string2intlist(cark)
+ loclist = cark
loclist.extend(nelt)
result.append(loclist)
# print result
@@ -408,14 +455,18 @@
il = self.indexate(news, row[2])
totsplits = []
for ne, nind in il:
- # aa=self.getHumorSplits(ne)
- # print aa
- termsplits = self.getAllSplits(ne, 0)
+# termsplits = self.getAllSplits(ne, 0)
# print ne
# print termsplits
- termsplits = self.humorSplit(ne, termsplits)
+# spdict = {str(len(ne)) : 1}
+ print "_____"
+ spdict = {'6,8' : 1}
+ termsplits = self.humorSplit(ne, spdict)
+
# self.printSplits(ne, termsplits)
+ print termsplits
totsplits.append(termsplits)
+ print totsplits
crtl = self.cartese(totsplits)
# print crtl
fin = self.getFinalSplits(row[2], crtl)
@@ -428,17 +479,12 @@
#oh.makeIncidences(1)
oh.fill()
oh.try2(2)
-#oh.getAllSplits("trolibusz", 0)
-
-#print oh.getAllSplits("agyagtalajhumorista", 0)
-#ll=oh.getAllSplits("agyagtalajhumorista", 0)
#oh.humorSplit("agyagtalajhumorista", ll)
#print oh.getAllSplits("virslitad", 0)
#print "---"
#ll = oh.getAllSplits("agyagos", 0)
#print ll
#oh.humorSplit("agyagos", ll)
-ll=oh.getAllSplits("agyagtalaj", 0)
spl=oh.humorSplit("agyagtalaj", ll)
oh.printSplits("agyagtalaj", spl)
oh.bye()
More information about the Hejes-devel mailing list