[Hejes-devel] [892] code beautification in hitmarker, index table creation in ohanalyze

Sun Aug 4 22:14:45 CEST 2013

Revision: 892
Author:   hussami
Date:     2013-08-04 22:14:44 +0200 (Sun, 04 Aug 2013)
Log Message:
-----------
code beautification in hitmarker, index table creation in ohanalyze

Modified Paths:
--------------
    trunk/misc/osiris_xml/hitmarker.py
    trunk/misc/osiris_xml/ohanalyze.py

Modified: trunk/misc/osiris_xml/hitmarker.py
===================================================================

--- trunk/misc/osiris_xml/hitmarker.py	2013-08-04 18:51:24 UTC (rev 891)
+++ trunk/misc/osiris_xml/hitmarker.py	2013-08-04 20:14:44 UTC (rev 892)
@@ -86,7 +86,28 @@
             hits[term_id][version] = []
         hits[term_id][version].append((begin, end))
 
+    def addWDEntry(self, dct, term_id, version, cur_index, self_index, start):
+        if term_id not in dct:
+            dct[term_id] = {}
 
+        if version not in dct[term_id]:
+            dct[term_id][version] = {}
+
+        dct[term_id][version][cur_index] = (self_index, start)
+
+    def isContinuation(self, dct, term_id, version, cur_index):
+        if term_id not in dct:
+            return False
+
+        if version not in dct[term_id]:
+            return False
+
+        if (cur_index - 1) not in dct[term_id][version]:
+            return False
+
+        return True
+
+
     def lookup(self, lst):
         """
             lst is a list of keys
@@ -96,71 +117,52 @@
 
         #for all elements of the list
         for li, le in enumerate(lst):
-            print "\twd at ", li, " out of ", len(lst), " is ", workdict
             newworkdict = {}
             #first: find all entries for this key
             entries = self.hits.getCompleteHit(le)
             if not entries:
                 workdict = {}
                 continue
-            print entries
 
             for term_id, term_desc_list in entries.iteritems():
                 #banned term? forget about it
                 if term_id in self.hits.banned:
                     continue
-                print "termid=", term_id
 
                 for term_desc in term_desc_list:
                     version = term_desc[0]
-                    print "\tnow looking at version: " + str(version), term_desc[1]
+
                     #is this id already in progress?
-                    cont_branch = (term_id in workdict)
-                    if cont_branch:
-                        cont_branch = version in workdict[term_id]
-                    if cont_branch:
-                        cont_branch = (term_desc[1] - 1) in \
-                            workdict[term_id][version]
+                    cont_branch = self.isContinuation(workdict, term_id, 
+                        version, term_desc[1])
 
                     #if it is in progress, see if we can now upgrade or exit
                     if cont_branch:
                         workentry = workdict[term_id][version][term_desc[1] - 1]
+
                         #is this a continuation by index?
                         if term_desc[1] == \
                             workentry[0] + 1:
+
                             #is this the last entry for the term?
                             if term_desc[2]: 
-                                print "A hit!"
                                 self.recordHit(result, term_id, version,
                                     workentry[1], li)
+
                             else:
-                                if term_id not in newworkdict:
-                                    newworkdict[term_id] = {}
-                                if version not in newworkdict[term_id]:
-                                    newworkdict[term_id][version] = {}
-                                newworkdict[term_id][version][term_desc[1]] = (
-                                    workentry[0] + 1, workentry[1])
-                                print "an upgrade!"
-                        else:
-#                            del workdict[term_id]
-                            print "This did not match"
+                                self.addWDEntry(newworkdict, term_id, version,
+                                    term_desc[1], workentry[0]+1, workentry[1])
 
                     #anyway, we should check if it's a new entry
-                    print "td1=", term_desc[1]
                     if term_desc[1] == 0: #a beginning
+
                         if term_desc[2]: #and end as well!
-                            print "hit2"
                             self.recordHit(result, term_id, version, li, li)
+
                         else: #record new
-                            if term_id not in newworkdict:
-                                newworkdict[term_id] = {}
+                            self.addWDEntry(newworkdict, term_id, version, 
+                                0, 0, li)
 
-                            if version not in newworkdict[term_id]:
-                                newworkdict[term_id][version] = {}
-
-                            newworkdict[term_id][version][0] = (0, li)
-                            print "a newie"
-
             workdict = newworkdict
         return result
 
@@ -174,10 +176,9 @@
         ('farka', 100, 1, 1, 1, 1), ('cica', 405, 1, 0, 1, 11)]
     hm.fill(lst)
 
-    print hm.hits
+#    print hm.hits
 #    tox = ['cica', 'rugja', 'mar', 'meg']
 #    tox = ['cica', 'rugja', 'meg', 'rugja']
     tox = ['cica', 'cica', 'farka', 'cica', 'cica', 'farka']
-    print "lento",len(tox)
     rr = hm.lookup(tox)
     print "result=", rr

Modified: trunk/misc/osiris_xml/ohanalyze.py
===================================================================
--- trunk/misc/osiris_xml/ohanalyze.py	2013-08-04 18:51:24 UTC (rev 891)
+++ trunk/misc/osiris_xml/ohanalyze.py	2013-08-04 20:14:44 UTC (rev 892)
@@ -85,7 +85,7 @@
       result.append(sub)
     return result
 
-  def indexate(self, lst, norm):
+  def indexate(self, lst, norm, ind_type = 2):
     """Finds an ordered set of strings in a normalized string
        Param: the list and the normalized string
        Returns: a list of tuples
@@ -93,15 +93,18 @@
     """
     mindex = 0
     result = []
-    for l in lst:
-      ind = norm.find(l, mindex)
+    for li, le in enumerate(lst):
+      ind = norm.find(le, mindex)
       if mindex == -1:
         raise 'Problem: ' + norm
       #convert numbers!
   #    if l.isdigit():
   #      t = "_NUMBER_", ind
   #    else:
-      t = l, ind
+      if ind_type == 1:
+        t = le, ind
+      else:
+        t = le, li
       result.append(t)
       mindex = ind + 1;
 
@@ -398,7 +401,8 @@
 #      self.checkTags(l)
 
 
-  def makeIncidences(self, where):
+  #for indexing_type use: 1 - char indices (in norm), else: token indices
+  def makeIncidences(self, where, indexing_type = 2):
     if where == 0:
       self.output("use dbdict", where)
     self.output("drop table incidences", where)
@@ -418,7 +422,7 @@
       counter += 1
       news = self.tokenize(row[1])
       idval = int(row[0])
-      il = self.indexate(news, row[2])
+      il = self.indexate(news, row[2], indexing_type)
       for ne, nind in il:
         self.output("insert into incidences(term, dict_id, idx) values('" + \
           ne + "', " + str(idval) + ", " + str(nind) + ")", where)
@@ -442,9 +446,9 @@
       print "prl=", prl
       print "\t" + "+".join(prl)
 
-  def getFinalSplits(self, s, splits):
+  def getFinalSplits(self, word_id, s, splits, fmt):
     result = []
-    for spt in splits:
+    for spi, spt in enumerate(splits):
       previndex = 0
       prl = []
 #      print spt
@@ -453,7 +457,13 @@
         prl.append(s[previndex:curindex])
 #        print "\t", previndex, curindex, s[previndex:curindex]
         previndex = curindex
-      result.append('+'.join(prl))
+      if fmt == 2:
+        result.append('+'.join(prl))
+      else:
+        ll = len(prl) - 1
+        for pi, pe in enumerate(prl):
+          result.append("insert into word_indices(lemma, word_id, version, word_index, islast) values ('{0}', {1}, {2}, {3}, {4});".format(pe, word_id, spi, pi, int(pi == ll)))
+
     return result
 
 
@@ -478,7 +488,8 @@
 #    print "CARTESE:", result
     return result
 
-  def try2(self, where):
+  #format: 2: zsofi, 3: sql
+  def try2(self, fmt):
 
 #    query = "select id, actual, norm from ohdict where id > 1000 limit 500";
 #    query = "select id, actual, norm from ohdict where id > 3022 limit 2";
@@ -494,7 +505,7 @@
       counter += 1
       news = self.tokenize(row[1])
       idval = int(row[0])
-      il = self.indexate(news, row[2])
+      il = self.indexate(news, row[2], 2)
       totsplits = []
       for ne, nind in il:
         termsplits = self.humorSplit(ne)
@@ -505,7 +516,7 @@
 #      print totsplits
       crtl = self.cartese(totsplits)
 #      print crtl
-      fin = self.getFinalSplits(row[2], crtl)
+      fin = self.getFinalSplits(row[0], row[2], crtl, fmt)
       print row[0], row[2]
       for qq in fin:
         print "\t" + qq
@@ -521,9 +532,9 @@
 dumpmsgs = False
 
 oh = OHAnalyze(False)
-#oh.makeIncidences(1)
+#oh.makeIncidences(0, 1)
 oh.fill()
-oh.try2(2)
+oh.try2(3)
 #oh.humorSplit("agyagtalajhumorista", ll)
 #print oh.getAllSplits("virslitad", 0)
 #print "---"