[Hejes-devel] [915] 2bsure

Thu Aug 29 07:32:16 CEST 2013

Revision: 915
Author:   hussami
Date:     2013-08-29 07:32:16 +0200 (Thu, 29 Aug 2013)
Log Message:
-----------
2bsure

Modified Paths:
--------------
    trunk/misc/osiris_xml/ohanalyze.py

Modified: trunk/misc/osiris_xml/ohanalyze.py
===================================================================

--- trunk/misc/osiris_xml/ohanalyze.py	2013-08-28 16:36:39 UTC (rev 914)
+++ trunk/misc/osiris_xml/ohanalyze.py	2013-08-29 05:32:16 UTC (rev 915)
@@ -3,6 +3,7 @@
 # coding: utf8
 
 import sys
+import time
 import copy
 import re
 sys.path.append("../../web2py/applications/helyesiras_webdev/modules")
@@ -51,6 +52,7 @@
 #    self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
     self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
     self.splittags_ext = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3, '_IKEP': 4}
+    self.naughtychars = ' |;|-|/' #the separators we'll observe
     if fill:
       self.fill(self.db)
 
@@ -76,7 +78,14 @@
        Param: the target string
        Returns: the result list
     """
-    l = re.split(' |;|-', s)
+#    l = re.split(' |;|-|/', s)
+    l = re.split(self.naughtychars, s)
+    l2 = []
+    for le in l:
+      if le:
+        l2.append(le)
+    l = l2
+#    print l
     result = []
     for sub in l:
 #      if sub.isdigit():
@@ -111,6 +120,7 @@
 #    print "indexate result=", result
     return result
 
+
   def string2intlist(self, s):
     spl = s.split(',')
     lst=[]
@@ -252,7 +262,8 @@
         2. for all analyses: correct them
     """
     if len(s) == 0:
-      print "EXMPY"
+#      dct[startind] = {len(s) : 1}
+      print "EXMPY", dct
       return
 #    print "    Start with", s, startind, "on level", level
     hl = self.humorize(s)
@@ -281,6 +292,7 @@
   def analyzeRecursion(self, s, dct, ind, lst, result):
     l = len(s)
 #    print "anal:", ind, l
+#    print dct
     if ind not in dct:
       return
 #    print dct[ind]
@@ -337,8 +349,11 @@
       result.append(lst)
 
   def addAnalysis(self, s, morphlist, result):
+#    print [y.lex for y in morphlist]
     relevant = self.correctHumorSplit2(\
         [(y.lex, y.tag, len(y.lex)) for y in morphlist])
+    if not relevant:
+      return 0
     t1,t2,t3 = relevant[-1]
 
     if t2 == 'NOM' or t2 == 'e3':
@@ -491,21 +506,30 @@
   #format: 2: zsofi, 3: sql
   def try2(self, fmt):
 
+    if fmt == 3:
+        print "drop table word_indices;"
+        print "create table word_indices(lemma varchar(100), word_id integer, version integer, word_index integer, islast integer);"
+        print "create index wordindices_index on word_indices(lemma);"
+
 #    query = "select id, actual, norm from ohdict where id > 1000 limit 500";
 #    query = "select id, actual, norm from ohdict where id > 3022 limit 2";
 #    query = "select id, actual, norm from ohdict order by id asc";
+#    query = "select id, actual, norm from ohdict where id > 31340 and id not in (10954, 10962, 26766, 28090, 31341, 41501, 41502, 41503, 72479, 72480, 72481, 74282, 72483, 72484) order by id asc";
+#    query = "select id, actual, norm from ohdict where id=11405"
     query = "select id, actual, norm from ohdict where id > 0 and id not in (253, 1617, 10954, 10962, 14118, 21042, 26766, 28090, 31341, 35355, 41501, 41502, 41503, 53195, 63253, 66216, 66217, 68562, 69228, 69229, 72479, 72480, 72481, 72482, 72483, 72484, 72565, 79810, 79812, 81608, 81609, 82178, 82181, 104301, 107148, 110649, 112890) order by id asc";
+#    query = "select id, actual, norm from ohdict where id > 0 and id not in (253, 1617, 10954, 10962, 14118, 21042, 26766, 28090, 31341, 35355, 41501, 41502, 41503, 53195, 63253, 66216, 66217, 68562, 69228, 69229, 72479, 72480, 72481, 72482, 72483, 72484, 72565, 79810, 79812, 81608, 81609, 82178, 82181, 104301, 107148, 110649, 112890) order by id asc";
 #107148: recursion depth!
     self.db.execute(query)
 
     results = self.db.fetchall()
 
     counter = 0
-    for row in results:
+    for ri, row in enumerate(results):
       counter += 1
       news = self.tokenize(row[1])
       idval = int(row[0])
       il = self.indexate(news, row[2], 2)
+      filterednorm = re.sub(self.naughtychars, '', row[2])
       totsplits = []
       for ne, nind in il:
         termsplits = self.humorSplit(ne)
@@ -516,10 +540,13 @@
 #      print totsplits
       crtl = self.cartese(totsplits)
 #      print crtl
-      fin = self.getFinalSplits(row[0], row[2], crtl, fmt)
-      print row[0], row[2]
+      fin = self.getFinalSplits(row[0], filterednorm, crtl, fmt)
+      if fmt == 2:
+        print row[0], filterednorm
       for qq in fin:
         print "\t" + qq
+      if ri % 500 == 0:
+        time.sleep(1)
 
 
 if len(sys.argv) < 2: