[Hejes-devel] [989] lots of news:

hejes-devel at nytud.hu hejes-devel at nytud.hu
Tue Oct 15 19:45:14 CEST 2013


Revision: 989
Author:   hussami
Date:     2013-10-15 19:45:14 +0200 (Tue, 15 Oct 2013)
Log Message:
-----------
lots of news:
1. combinations of possible colloc generated
2. all possible retokenizations generated (based on #1)

So far only on dummy data, though; need to start working on integration.

Modified Paths:
--------------
    trunk/misc/osiris_xml/hitmarker.py

Modified: trunk/misc/osiris_xml/hitmarker.py
===================================================================
--- trunk/misc/osiris_xml/hitmarker.py	2013-10-13 22:06:12 UTC (rev 988)
+++ trunk/misc/osiris_xml/hitmarker.py	2013-10-15 17:45:14 UTC (rev 989)
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from collections import OrderedDict
 
 class MarkupDict:
     """
@@ -166,7 +167,92 @@
             workdict = newworkdict
         return result
 
+    """
+       Sorts the lookup result. Returns a list of dicts where
+           the list index corresponds to colloc beginnings
+           the dict key corresponds to colloc endings
+           the dict value is a dummy (sortedset, OrderedDict are less standard
+               than the regular dict).
+    """
+    def sortMarks(self, marks, inplen):
+        result = [{} for _ in range(inplen)]
 
+        #iterate per dict entry
+        for mk, mv in marks.iteritems():
+            #iterate per dict entry version
+            for m2k, m2v in mv.iteritems():
+                #iterate the list of hits for this entry
+                for le in m2v:
+                    result[le[0]][le[1]] = 1
+
+        for le in result:
+            le = sorted(le)
+        return result
+
+    def iterativeCombine(self, sorti, mem, stack, inplen, curindex, level):
+        
+        while True:
+            if curindex >= inplen:
+                return False
+
+            if not sorti[curindex]:
+                curindex += 1
+                continue 
+            break
+
+        for dk in sorti[curindex]:
+#            print 2*level*" ", "calling", curindex, "->", dk
+            stack.append((curindex, dk))
+            ret= self.iterativeCombine(sorti, mem, stack, inplen, dk+1, level+1)
+
+#            print 2*level*" ", stack
+            if not ret:
+                mem.append(stack[:])
+            del stack[-1]
+        return True
+
+    def combineMarks(self, marks, inplen):
+        
+        sorti = self.sortMarks(marks, inplen)
+#        print sorti
+        mem = []
+        stack = []
+        self.iterativeCombine(sorti, mem, stack, inplen, 0, 0)
+        print "mem=", mem
+        return mem
+
+    def produceRetokenization(self, mem, inplen):
+
+        result = []
+        for me in mem:
+            loclist = []
+            tokindex = 0
+            memindex = 0
+            
+            while True:
+                if tokindex >= inplen:
+                    break
+
+                if memindex >= len(me):
+                    loclist.extend([(v, v) for v in range(tokindex, inplen)])
+#                    print "autofill"
+                    break
+
+                if tokindex < me[memindex][0]:
+                    loclist.append((tokindex, tokindex))
+                    tokindex += 1
+                    continue
+                else:
+                    loclist.append(me[memindex])
+                    tokindex = me[memindex][1] + 1
+                    memindex += 1
+                    continue
+
+#            print "LOC=", loclist
+            result.append(loclist[:])
+        return result
+
+
 if __name__ == "__main__":
     hm = HitMarker()
 
@@ -182,3 +268,6 @@
     tox = ['cica', 'cica', 'farka', 'cica', 'cica', 'farka']
     rr = hm.lookup(tox)
     print "result=", rr
+
+    mem = hm.combineMarks(rr, len(tox))
+    print hm.produceRetokenization(mem, len(tox))




More information about the Hejes-devel mailing list