[Hejes-devel] [819] getting there.

Fri Jun 14 14:26:57 CEST 2013

Revision: 819
Author:   hussami
Date:     2013-06-14 14:26:57 +0200 (Fri, 14 Jun 2013)
Log Message:
-----------
Getting there. Now have splits for each term in the actual field; just need to take their Cartesian product.

Modified Paths:
--------------
    trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py

Modified: trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py
===================================================================

--- trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py	2013-06-13 10:16:17 UTC (rev 818)
+++ trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py	2013-06-14 12:26:57 UTC (rev 819)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*- 
-# coding: utf-8
+"""# -*- coding: utf-8 -*- """
+# coding: utf8
 
 import sys
 import MySQLdb
@@ -19,8 +19,9 @@
     self.disconnect()
     self.cursor = None
 
-  def connect(self, server, user, pwd, db):
-    self.connection = MySQLdb.connect(server, user, pwd, db)
+  def connect(self, server, user, pwd, dbs):
+    self.connection = MySQLdb.connect(host=server, user=user, passwd=pwd, \
+      db=dbs, charset='utf8')
 
   def disconnect(self):
     if self.connection != None:
@@ -51,8 +52,16 @@
     self.mysqlhandler.execute(query)
     self.data = {}
     results = self.mysqlhandler.fetchall()
+#   c=0
     for row in results:
-      self.data[row[0]] = 0
+#     c+=1
+#      self.data[row[0]] = 0
+      self.data[row[0].encode("utf8")] = 0
+#     if c == 300:
+#       print row[0].encode("utf8")
+#       print self.data
+#      if row[0].find("agyag") > -1:
+#        print row[0]
 
   def isMember(self, s):
     if s in self.data:
@@ -142,8 +151,79 @@
 #            print " "*2*level + "APPEND: " + str(locresult)
 #        print " "*2*level + str(t)
 #    print " "*2*level + str(result)
+    if level == 0 and not result:
+      result.append([len(s)])
     return result
 
+  def filterTags(self, lst):  # remove, in place, every analysis that holds a tag not found in self.splittags
+    killables = []  # indices of the analyses to delete
+    for anali in range(0, len(lst)):
+      for lex, tag in lst[anali]:  # each analysis is unpacked as (lex, tag) pairs
+        if tag not in self.splittags:
+          killables.append(anali)
+          break  # one disallowed tag is enough; also keeps each index listed only once
+    for ind in reversed(killables):  # delete back to front so the remaining indices stay valid
+      del lst[ind]
+    return lst  # the same list object that was passed in
+
+  def humorFilter(self, s, splitindices): # filter bad splits: drop each multi-piece split of s that has a piece with no usable analysis
+    killables = []  # indices into splitindices scheduled for deletion
+    for i in range(0, len(splitindices)): # for all splits
+      if len(splitindices[i]) == 1:  # single-piece splits are kept without being analysed
+        continue
+
+      previndex = 0
+      good = True
+      for j in splitindices[i]: # for all entries in the split; each entry is added to the running offset, i.e. used as a piece length
+        curindex = previndex + j
+#        print "  " + str(previndex), str(curindex)
+        hl = self.humorize(s[previndex:curindex])
+        if not hl:  # humorize returned no analyses for this piece
+          killables.append(i)
+          break
+        previndex = curindex
+
+        self.filterTags(hl)  # prunes hl in place
+        if not hl:  # no analysis of this piece survived the tag filter
+          good = False
+          break
+      if not good:
+        killables.append(i)
+
+    for kitem in reversed(killables):  # delete back to front so the remaining indices stay valid
+      del splitindices[kitem]
+
+    if not splitindices:  # nothing left: fall back to a single piece spanning all of s
+      splitindices = [[len(s)]]
+    return splitindices
+
+
+  def humorize(self, s):  # analyse s and return a list with one list of (lex, tag) tuples per analysis
+    h = StemmingAnalysis(s.rstrip())  # StemmingAnalysis is defined outside this diff; trailing whitespace is stripped first
+#    print "'{0}'".format(s), ":", len(h.getAnas())
+    result = []
+    for x in h.getAnas():
+      locresult = []
+      tags = [y.tag for y in x.morphs]
+      forms = [y.lex for y in x.morphs]
+      if len(tags) != len(forms):  # NOTE(review): both lists are built from x.morphs, so this guard looks unreachable - confirm
+        raise Exception(str(tags) + " vs " + str(forms))
+
+      if tags[-1] == 'NOM':  # drop a trailing 'NOM' morph; NOTE(review): tags[-1] raises IndexError if x.morphs is empty - confirm it cannot be
+        del forms[-1]
+        del tags[-1]
+
+      for i in range(0, len(tags)):
+        t = forms[i], tags[i]  # pair each form with its tag
+        locresult.append(t)
+
+      result.append(locresult)
+
+#    print result
+    return result
+#      self.checkTags(l)
+
+
   def getHumorSplits(self, s):
     h = StemmingAnalysis(s.rstrip())
     print "'{0}'".format(s), ":", len(h.getAnas())
@@ -165,10 +245,11 @@
 
       result.append(locresult)
 
-    print result
+#    print result
 #      self.checkTags(l)
+    return result
 
-  def try1(self, where):
+  def makeIncidences(self, where):
     if where == 0:
       self.output("use dbdict", where)
     self.output("drop table incidences", where)
@@ -176,7 +257,7 @@
       "idx int)", where)
     self.output("create index incidence_index on incidences(term)", where)
 
-    query = "select id, actual, norm from ohdict where id > 1000 limit 100";
+    query = "select id, actual, norm from ohdict";
     self.db.execute(query)
 
     results = self.db.fetchall()
@@ -190,20 +271,60 @@
       for ne, nind in il:
         self.output("insert into incidences(term, dict_id, idx) values('" + \
           ne + "', " + str(idval) + ", " + str(nind) + ")", where)
+    if where == 1:
+      self.db.connection.commit()
 
-        self.getAllSplits(ne)
+  def printSplits(self, s, splits):  # print s, then one tab-indented line per split with its pieces joined by "+"
+    print s
+    for sp in splits:  # each sp is a sequence of piece lengths, consumed left to right
+      previndex = 0
+      prl = []  # substrings of s for this split
+      for v in sp:
+        curindex = previndex + v
+        prl.append(s[previndex:curindex])
+        previndex = curindex  # next piece starts where this one ended (was `v`, which mis-sliced the third and later pieces)
+      print "\t" + "+".join(prl)
 
-    if not where:
-      self.db.connection.commit()
+  def try2(self, where):  # debug driver: print the filtered splits of each term from up to 100 ohdict rows; `where` is not used in this method
+
+    query = "select id, actual, norm from ohdict where id > 1000 limit 100";
+    self.db.execute(query)
 
+    results = self.db.fetchall()
+
+    counter = 0  # incremented per row but never read in this method
+    for row in results:
+      counter += 1
+      news = self.tokenize(row[1])  # row[1] is the `actual` column of the query
+      idval = int(row[0])  # row[0] is `id`; idval is not used below
+      il = self.indexate(news, row[2])  # row[2] is `norm`; il is iterated as 2-tuples
+      for ne, nind in il:
+#        self.getHumorSplits(ne)
+        termsplit = self.getAllSplits(ne, 0)
+#        print ne
+#        print termsplit
+        split = self.humorFilter(ne, termsplit)  # discard splits whose pieces fail analysis
+#        print split
+        self.printSplits(ne, split)
+
+
+
 oh = OHAnalyze()
-#oh.try1(2)
+#oh.makeIncidences(2)
+oh.try2(2)
 #oh.getAllSplits("trolibusz", 0)
-print oh.getAllSplits("agyagtalajhumorista", 0)
-print oh.getAllSplits("virslitad", 0)
+
+#print oh.getAllSplits("agyagtalajhumorista", 0)
+#ll=oh.getAllSplits("agyagtalajhumorista", 0)
+#oh.humorFilter("agyagtalajhumorista", ll)
+#print oh.getAllSplits("virslitad", 0)
+#print "---"
+ll = oh.getAllSplits("agyagos", 0)
+#print ll
+oh.humorFilter("agyagos", ll)
 oh.bye()
 
 ###
 # NOTE: Number conversions!!!
 ###
+print "REMINDER: have a list of banned morphemes, i.e. letters etc., test for these in getAllSplits"