[Hejes-devel] [878] hitmarker algorithm checkin, still need integration

Wed Jul 31 14:06:13 CEST 2013

Revision: 878
Author:   hussami
Date:     2013-07-31 14:06:12 +0200 (Wed, 31 Jul 2013)
Log Message:
-----------
hitmarker algorithm checkin, still need integration

Modified Paths:
--------------
    trunk/misc/osiris_xml/ohanalyze.py

Added Paths:
-----------
    trunk/misc/osiris_xml/hitmarker.py

Added: trunk/misc/osiris_xml/hitmarker.py
===================================================================

--- trunk/misc/osiris_xml/hitmarker.py	                        (rev 0)
+++ trunk/misc/osiris_xml/hitmarker.py	2013-07-31 12:06:12 UTC (rev 878)
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+
+class MarkupDict:
+    """
+        The dictionary to contain all markup info.
+        Key: the lookup key; for text, it will be a string (a word).
+        Data: per key, a dict of occurrences -- {term_id: [(term index, islast, misc)]}
+    """
+
+    def __init__(self, sensitive = True):
+        self.data = {}
+        self.banned = {}
+        self.sensitive = sensitive
+
+    def clear(self):
+        self.data = {}
+        self.banned = {}
+
+    def addHit(self, key, term_id, term_index, islast, misc):
+        if key not in self.data:
+            self.data[key] = {}
+
+        if term_id not in self.data[key]:
+            self.data[key][term_id] = [(term_index, islast, misc)]
+        else:
+            self.data[key][term_id].append((term_index, islast, misc))
+
+    def dropCompleteHit(self, key):
+        if key in self.data:
+            del self.data[key]
+
+    def dropTermFromHit(self, key, term_id):
+        if key in self.data:
+            if term_id in self.data[key]:
+                del self.data[key][term_id]
+
+    def getCompleteHit(self, key):
+        if key in self.data:
+            return self.data[key]
+        if not self.sensitive:
+            return None
+        else:
+            raise Exception("No such key: " + key)
+
+    def getTermHit(self, key, term_id):
+        if key in self.data:
+            if term_id in self.data[key]:
+                return self.data[key][term_id]
+
+        if not self.sensitive:
+            return None
+        else:
+            raise Exception("No such key / term_id: " + str(key) + ", " +
+                str(term_id))
+
+    def addBanned(self, b):
+        self.banned[b] = 1
+
+    def isBanned(self, b):
+        return (b in self.banned)
+
+    def __str__(self):
+        return "Markup obj:" + str(self.data) + ", bans:" + str(self.banned)
+
+
+class HitMarker:
+    def __init__(self):
+        self.hits = MarkupDict(False)
+
+    def fill(self, lst):
+        """
+            lst must be a list of tuples: (key, word_id, word_index, islast, misc)
+        """
+        for lk, lwd, lwi, lis, limisc in lst:
+            self.hits.addHit(lk, lwd, lwi, lis, limisc)
+
+    def lookup(self, lst):
+        """
+            lst is a list of keys
+        """
+        workdict = {}
+
+        #for all elements of the list
+        for li, le in enumerate(lst):
+            newworkdict = {}
+            #first: find all entries for this key
+            entries = self.hits.getCompleteHit(le)
+            if not entries:
+                workdict = {}
+                continue
+            print entries
+
+            for term_id, term_desc_list in entries.iteritems():
+                #banned term? forget about it
+                if term_id in self.hits.banned:
+                    continue
+                print "termid=", term_id
+
+                for term_desc in term_desc_list:
+                    #is this id already in progress?
+                    if term_id in workdict:
+                        #is this a continuation by index?
+                        if term_desc[0] == workdict[term_id][0] + 1:
+                            #is this the last entry for the term?
+                            if term_desc[1]: 
+                                print "A hit!"
+                            else:
+                                newworkdict[term_id] = (workdict[term_id][0] + 1, \
+                                    workdict[term_id][1])
+                                print "an upgrade!"
+                        else:
+                            del workdict[term_id]
+                            print "DEL"
+
+                    else: #a new sight!
+                        print "td0=", term_desc
+                        if term_desc[0] == 0: #a beginning
+                            print "okay"
+                            if term_desc[1]: #and end as well!
+                                print "hit2"
+                            else: #record new
+                                newworkdict[term_id] = (0, li)
+                                print "a newie"
+
+            workdict = newworkdict
+
+
+if __name__ == "__main__":
+    hm = HitMarker()
+
+    lst = [('cica', 15, 0, 0, 9), ('rugja', 15, 1, 0, 9), ('meg', 15, 2, 1, 9),\
+        ('cica', 100, 0, 0, 1), ('farka', 100, 1, 1, 1)]
+    hm.fill(lst)
+
+    print hm.hits
+    tox = ['cica', 'rugja', 'mar', 'meg']
+    hm.lookup(tox)

Modified: trunk/misc/osiris_xml/ohanalyze.py
===================================================================
--- trunk/misc/osiris_xml/ohanalyze.py	2013-07-30 16:01:08 UTC (rev 877)
+++ trunk/misc/osiris_xml/ohanalyze.py	2013-07-31 12:06:12 UTC (rev 878)
@@ -248,6 +248,9 @@
             starting at index startind
         2. for all analyses: correct them
     """
+    if len(s) == 0:
+      print "EXMPY"
+      return
 #    print "    Start with", s, startind, "on level", level
     hl = self.humorize(s)
 #    print "\thl=", s, hl
@@ -302,7 +305,7 @@
     """
     dct = {}
     self.recurse(s, dct, 0)
-#    print "dct=", dct
+#    print "dct=", s, dct
     lll = []
     resl = []
     self.analyzeRecursion(s, dct, 0, lll, resl)
@@ -336,13 +339,18 @@
     t1,t2,t3 = relevant[-1]
 
     if t2 == 'NOM' or t2 == 'e3':
-      print "DELLA!"
+#      print "DELLA!"
       del relevant[-1]
 
     lens = [y for _,_,y in relevant]
+
+    starred = False
+    for t1, t2, t3 in relevant:
+      if t1.find("*") != -1:
+        starred = True
     sl = sum(lens)
     ls = len(s)
-    if sl < ls:
+    if (sl != ls) and (not starred):
 #      print "DISCREPA!", s
       return 0
 
@@ -473,7 +481,7 @@
   def try2(self, where):
 
 #    query = "select id, actual, norm from ohdict where id > 1000 limit 500";
-#    query = "select id, actual, norm from ohdict where id > 1129 limit 2";
+#    query = "select id, actual, norm from ohdict where id > 3022 limit 2";
 #    query = "select id, actual, norm from ohdict order by id asc";
     query = "select id, actual, norm from ohdict where id > 0 and id not in (253, 1617, 10954, 10962, 14118, 21042, 26766, 28090, 31341, 35355, 41501, 41502, 41503, 53195, 63253, 66216, 66217, 68562, 69228, 69229, 72479, 72480, 72481, 72482, 72483, 72484, 72565, 79810, 79812, 81608, 81609, 82178, 82181, 104301, 107148, 110649, 112890) order by id asc";
 #107148: recursion depth!