[Hejes-devel] [818] version control just for me

Thu Jun 13 12:16:17 CEST 2013

Revision: 818
Author:   hussami
Date:     2013-06-13 12:16:17 +0200 (Thu, 13 Jun 2013)
Log Message:
-----------
version control just for me

Modified Paths:
--------------
    trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py

Modified: trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py
===================================================================

--- trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py	2013-06-10 18:47:36 UTC (rev 817)
+++ trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py	2013-06-13 10:16:17 UTC (rev 818)
@@ -1,7 +1,12 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*- 
+# coding: utf-8
 
+import sys
 import MySQLdb
 import re
+from egybekulon2_humor import StemmingAnalysis
+from egybekulon2_humor import HumorAna
 
 class MySQLHandler:
   def __init__(self, verbose = 0):
@@ -32,16 +37,52 @@
   def fetchall(self):
     return self.cursor.fetchall()
 
+##########
 
+class OHTerms:
+  
+  def __init__(self, handler):
+    self.mysqlhandler = handler
+    self.data = {}
+    self.fill()
+
+  def fill(self):
+    query = "select distinct term from incidences"
+    self.mysqlhandler.execute(query)
+    self.data = {}
+    results = self.mysqlhandler.fetchall()
+    for row in results:
+      self.data[row[0]] = 0
+
+  def isMember(self, s):
+    if s in self.data:
+      return True
+    return False
+
+##################
+
 class OHAnalyze:
   def __init__(self):
     self.db = MySQLHandler()
     self.db.connect("localhost", "dbdicter", "dbdicter123", "dbdict")
+    self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
 
+    # FIXME test code!
+    self.ohterms = OHTerms(self.db)
+
   def bye(self):
     if self.db != None:
       self.db.disconnect()
 
+  def output(self, s, where):
+    if where == 0: #dump to text
+      print s + ";"
+    elif where == 1:
+      query = s;
+      self.db.execute(query)
+    else:
+      pass
+
   def tokenize(self, s):
     """Split a string into tokens
        Param: the target string
@@ -78,26 +119,89 @@
 
     return result
 
-  def try1(self):
-    print "use dbdict;"
-    print "drop table incidences;"
-    print "create table incidences(term varchar(100), dict_id int, " + \
-      "idx int);"
-    query = "select id, actual, norm from ohdict where id < 20";
+  def getAllSplits(self, s, level):
+    result = []
+    ll = len(s)
+    for i in range(2, ll + 1): #ignore single-letter entries
+      if self.ohterms.isMember(s[:i]):
+#        print " "*2*level + "MEMBER: " + s[:i]
+        if i == ll:
+          locresult = []
+          locresult.append(i)
+          result.append(locresult)
+#          print " "*2*level + "APPEND: " + str(locresult)
+          continue
+
+        t = self.getAllSplits(s[i:], level+1)
+        if t:
+          for resit in t:
+            locresult = []
+            locresult.append(i)
+            locresult.extend(resit)
+            result.append(locresult)
+#            print " "*2*level + "APPEND: " + str(locresult)
+#        print " "*2*level + str(t)
+#    print " "*2*level + str(result)
+    return result
+
+  def getHumorSplits(self, s):
+    h = StemmingAnalysis(s.rstrip())
+    print "'{0}'".format(s), ":", len(h.getAnas())
+    result = []
+    for x in h.getAnas():
+      locresult = []
+      tags = [y.tag for y in x.morphs]
+      forms = [y.lex for y in x.morphs]
+      if len(tags) != len(forms):
+        raise Exception(str(tags) + " vs " + str(forms))
+
+      if tags[-1] == 'NOM':
+        del forms[-1]
+        del tags[-1]
+
+      for i in range(0, len(tags)):
+        t = forms[i], tags[i]
+        locresult.append(t)
+
+      result.append(locresult)
+
+    print result
+#      self.checkTags(l)
+
+  def try1(self, where):
+    if where == 0:
+      self.output("use dbdict", where)
+    self.output("drop table incidences", where)
+    self.output("create table incidences(term varchar(100), dict_id int, " + \
+      "idx int)", where)
+    self.output("create index incidence_index on incidences(term)", where)
+
+    query = "select id, actual, norm from ohdict where id > 1000 limit 100";
     self.db.execute(query)
+
     results = self.db.fetchall()
 
+    counter = 0
     for row in results:
+      counter += 1
       news = self.tokenize(row[1])
       idval = int(row[0])
       il = self.indexate(news, row[2])
       for ne, nind in il:
-        print "insert into incidences(term, dict_id, index) values('" + \
-          ne + "', " + str(idval) + ", " + str(nind) + ");"
+        self.output("insert into incidences(term, dict_id, idx) values('" + \
+          ne + "', " + str(idval) + ", " + str(nind) + ")", where)
 
+        self.getAllSplits(ne)
 
+    if not where:
+      self.db.connection.commit()
+
+
 oh = OHAnalyze()
-oh.try1()
+#oh.try1(2)
+#oh.getAllSplits("trolibusz", 0)
+print oh.getAllSplits("agyagtalajhumorista", 0)
+print oh.getAllSplits("virslitad", 0)
 oh.bye()
 
 ###