[Hejes-devel] [989] lots of news:
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Tue Oct 15 19:45:14 CEST 2013
Revision: 989
Author: hussami
Date: 2013-10-15 19:45:14 +0200 (Tue, 15 Oct 2013)
Log Message:
-----------
lots of news:
1. combinations of possible colloc generated
2. all possible retokenizations generated (based on #1)
Tested on dummy data only so far; integration work still needs to start.
Modified Paths:
--------------
trunk/misc/osiris_xml/hitmarker.py
Modified: trunk/misc/osiris_xml/hitmarker.py
===================================================================
--- trunk/misc/osiris_xml/hitmarker.py 2013-10-13 22:06:12 UTC (rev 988)
+++ trunk/misc/osiris_xml/hitmarker.py 2013-10-15 17:45:14 UTC (rev 989)
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from collections import OrderedDict
class MarkupDict:
"""
@@ -166,7 +167,92 @@
workdict = newworkdict
return result
+ """
+ Sorts the lookup result. Returns a list of dicts where
+ the list index corresponds to colloc beginnings
+ the dict key corresponds to colloc endings
+ the dict value is a dummy (sortedset, OrderedDict are less standard
+ than the regular dict).
+ """
def sortMarks(self, marks, inplen):
    """Index lookup hits by start position, with end positions ascending.

    Args:
        marks: two-level mapping (entry -> entry version -> list of hits).
            Only the values are read. Each hit is indexable, with the
            colloc start at position 0 and the colloc end at position 1.
        inplen: number of input tokens; one output slot is created per
            token, so every hit start must be a valid index below it.

    Returns:
        A list of ``inplen`` dict-like mappings. ``result[start]`` has one
        key per end index of a hit that begins at ``start`` (value is the
        dummy ``1``), and iterating it yields the keys in ascending order.
        Slots without hits are empty mappings.

    Raises:
        IndexError: if a hit starts at an index outside ``range(inplen)``.
    """
    # Collect the distinct end indices per start index first; duplicates
    # coming from different entries or versions collapse here.
    ends_by_start = [set() for _ in range(inplen)]
    for versions in marks.values():
        for hits in versions.values():
            for hit in hits:
                ends_by_start[hit[0]].add(hit[1])

    # The previous implementation ran ``le = sorted(le)`` in a loop, which
    # only rebound the loop variable and left the result unsorted. Building
    # an OrderedDict from the sorted keys makes the ordering real while
    # still handing callers a dict subclass.
    return [
        OrderedDict((end, 1) for end in sorted(ends))
        for ends in ends_by_start
    ]
+
+ def iterativeCombine(self, sorti, mem, stack, inplen, curindex, level):  # Recursive (despite the name): extends the chain in `stack` with collocs found at or after curindex; finished chains are copied into `mem`.
+  # sorti[i] is iterated as the colloc endings for start index i; `level` is only referenced by the disabled debug prints below.
+ while True:  # advance curindex to the first position that has at least one colloc
+ if curindex >= inplen:
+ return False  # no colloc left in the rest of the input
+
+ if not sorti[curindex]:
+ curindex += 1
+ continue
+ break
+
+ for dk in sorti[curindex]:  # dk = one possible ending of a colloc starting at curindex
+# print 2*level*" ", "calling", curindex, "->", dk
+ stack.append((curindex, dk))
+ ret= self.iterativeCombine(sorti, mem, stack, inplen, dk+1, level+1)  # look for further collocs after this one's end
+
+# print 2*level*" ", stack
+ if not ret:
+ mem.append(stack[:])  # chain could not be extended: store a copy, because stack keeps being mutated
+ del stack[-1]  # backtrack before trying the next ending; presumably at for-loop level (indentation was lost in this archive) - TODO confirm
+ return True  # a colloc was found at or after the curindex passed in
+
+ def combineMarks(self, marks, inplen):  # Collect the colloc chains that iterativeCombine finds for the lookup result `marks` over inplen tokens; returns them as a list.
+
+ sorti = self.sortMarks(marks, inplen)  # one dict of colloc endings per start index
+# print sorti
+ mem = []  # output accumulator, filled in place by iterativeCombine
+ stack = []  # scratch chain shared by the whole recursion
+ self.iterativeCombine(sorti, mem, stack, inplen, 0, 0)  # start at token 0, recursion level 0; the boolean return value is ignored
+ print "mem=", mem  # NOTE(review): debug output on every call; Python 2 print statement
+ return mem
+
+ def produceRetokenization(self, mem, inplen):  # Expand each colloc chain in `mem` into a list of (start, end) spans over inplen tokens; returns one span list per chain.
+
+ result = []
+ for me in mem:  # me = one chain of (start, end) colloc spans
+ loclist = []
+ tokindex = 0  # next input token not yet placed into a span
+ memindex = 0  # next colloc of the chain to emit
+
+ while True:
+ if tokindex >= inplen:
+ break
+
+ if memindex >= len(me):
+ loclist.extend([(v, v) for v in range(tokindex, inplen)])  # chain exhausted: every remaining token becomes a single-token span
+# print "autofill"
+ break
+
+ if tokindex < me[memindex][0]:
+ loclist.append((tokindex, tokindex))  # token before the next colloc stays a single-token span
+ tokindex += 1
+ continue
+ else:
+ loclist.append(me[memindex])  # emit the colloc span itself
+ tokindex = me[memindex][1] + 1  # resume right after the colloc's end
+ memindex += 1
+ continue
+
+# print "LOC=", loclist
+ result.append(loclist[:])  # presumably once per chain, at for-loop level (indentation was lost in this archive) - TODO confirm
+ return result
+
+
if __name__ == "__main__":
hm = HitMarker()
@@ -182,3 +268,6 @@
tox = ['cica', 'cica', 'farka', 'cica', 'cica', 'farka']
rr = hm.lookup(tox)
print "result=", rr
+
+ mem = hm.combineMarks(rr, len(tox))
+ print hm.produceRetokenization(mem, len(tox))
More information about the Hejes-devel
mailing list