[Hejes-devel] [1006] temporary commit, some bugfixes

hejes-devel at nytud.hu hejes-devel at nytud.hu
Sun Nov 3 15:33:07 CET 2013


Revision: 1006
Author:   hussami
Date:     2013-11-03 15:33:07 +0100 (Sun, 03 Nov 2013)
Log Message:
-----------
temporary commit, some bugfixes

Modified Paths:
--------------
    trunk/misc/osiris_xml/hitmarker.py

Added Paths:
-----------
    trunk/misc/osiris_xml/collocate.py

Added: trunk/misc/osiris_xml/collocate.py
===================================================================
--- trunk/misc/osiris_xml/collocate.py	                        (rev 0)
+++ trunk/misc/osiris_xml/collocate.py	2013-11-03 14:33:07 UTC (rev 1006)
@@ -0,0 +1,3 @@
+#!/usr/bin/env python
+
+

Modified: trunk/misc/osiris_xml/hitmarker.py
===================================================================
--- trunk/misc/osiris_xml/hitmarker.py	2013-10-31 23:53:59 UTC (rev 1005)
+++ trunk/misc/osiris_xml/hitmarker.py	2013-11-03 14:33:07 UTC (rev 1006)
@@ -183,7 +183,7 @@
             for m2k, m2v in mv.iteritems():
                 #iterate the list of hits for this entry
                 for le in m2v:
-                    result[le[0]][le[1]] = 1
+                    result[le[0]][le[1]] = mk
 
         for le in result:
             le = sorted(le)
@@ -200,29 +200,63 @@
                 continue 
             break
 
-        for dk in sorti[curindex]:
+        maxdk_seen = -1
+
+        for dk, dv in sorti[curindex].iteritems():
+            if dk > maxdk_seen:
+                maxdk_seen = dk
+
 #            print 2*level*" ", "calling", curindex, "->", dk
-            stack.append((curindex, dk))
+            stack.append((curindex, dk, dv))
             ret= self.iterativeCombine(sorti, mem, stack, inplen, dk+1, level+1)
 
 #            print 2*level*" ", stack
             if not ret:
                 mem.append(stack[:])
             del stack[-1]
+
+        if level > 0:
+            return True
+
+        #additionally, we want to get all those meaningful combos where the
+        #curindex-th token is not included in the results
+        for i in range(curindex + 1, maxdk_seen + 1):
+            if sorti[i]:
+#                print 2*level*" ", "do more from", i
+                self.iterativeCombine(sorti, mem, stack, inplen, 
+                    i, level + 1)
+        
+
         return True
 
     def combineMarks(self, marks, inplen):
         
         sorti = self.sortMarks(marks, inplen)
-#        print sorti
+#        print "sorti", sorti
         mem = []
         stack = []
+
+        singles = {}
+
+        killme = []
+        for i in range(inplen):
+            for k, v in sorti[i].iteritems():
+                if k == i:
+                    singles[k] = v
+                    killme.append((i, k))
+        for k, v in killme:
+            del sorti[k][v]
+#        print "singles:", singles
+
         self.iterativeCombine(sorti, mem, stack, inplen, 0, 0)
         print "mem=", mem
-        return mem
 
-    def produceRetokenization(self, mem, inplen):
+        #finally, add the no-hit entry (i.e. all tokens)
+        mem.append([])
+        return mem, singles
 
+    def produceRetokenization(self, mem, singles, inplen):
+
         result = []
         for me in mem:
             loclist = []
@@ -234,12 +268,18 @@
                     break
 
                 if memindex >= len(me):
-                    loclist.extend([(v, v) for v in range(tokindex, inplen)])
-#                    print "autofill"
+                    for v in range(tokindex, inplen):
+                        if v in singles:
+                            loclist.append((v, v, singles[v]))
+                        else:
+                            loclist.append((v, v, -1))
                     break
 
                 if tokindex < me[memindex][0]:
-                    loclist.append((tokindex, tokindex))
+                    if tokindex in singles:
+                        loclist.append((tokindex, tokindex, singles[tokindex]))
+                    else:
+                        loclist.append((tokindex, tokindex, -1))
                     tokindex += 1
                     continue
                 else:
@@ -260,14 +300,22 @@
         ('cica', 100, 2, 0, 0, 1), ('cica', 100, 2, 1, 0, 1), \
         ('farka', 100, 2, 2, 1, 1), ('cica', 100, 1, 0, 0, 1), \
         ('farka', 100, 1, 1, 1, 1), ('cica', 405, 1, 0, 1, 11)]
+
+    lst = [('reti', 100, 4, 0, 0, 9), ('sas', 100, 4, 1, 1, 9),
+        ('sas', 101, 4, 0, 0, 9), ('fioka', 101, 4, 1, 1, 9),
+        ('fioka', 102, 4, 0, 0, 9), ('berlet', 102, 4, 1, 1, 9),
+        ('berlet', 103, 4, 0, 0, 9), ('dij', 103, 4, 1, 1, 9),
+        ('berlet', 113, 4, 0, 1, 9), ('fizetes', 200, 4, 0, 1, 10),
+        ('dij', 104, 4, 0, 0, 9), ('fizetes', 104, 4, 1, 1, 9)]
     hm.fill(lst)
 
 #    print hm.hits
 #    tox = ['cica', 'rugja', 'mar', 'meg']
 #    tox = ['cica', 'rugja', 'meg', 'rugja']
     tox = ['cica', 'cica', 'farka', 'cica', 'cica', 'farka']
+    tox = ['reti', 'sas', 'fioka', 'berlet', 'dij', 'fizetes']
     rr = hm.lookup(tox)
     print "result=", rr
 
-    mem = hm.combineMarks(rr, len(tox))
-    print hm.produceRetokenization(mem, len(tox))
+    mem, singles = hm.combineMarks(rr, len(tox))
+    print hm.produceRetokenization(mem, singles, len(tox))




More information about the Hejes-devel mailing list