[Hejes-devel] [1006] temporary commit, some bugfixes
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Sun Nov 3 15:33:07 CET 2013
Revision: 1006
Author: hussami
Date: 2013-11-03 15:33:07 +0100 (Sun, 03 Nov 2013)
Log Message:
-----------
temporary commit, some bugfixes
Modified Paths:
--------------
trunk/misc/osiris_xml/hitmarker.py
Added Paths:
-----------
trunk/misc/osiris_xml/collocate.py
Added: trunk/misc/osiris_xml/collocate.py
===================================================================
--- trunk/misc/osiris_xml/collocate.py (rev 0)
+++ trunk/misc/osiris_xml/collocate.py 2013-11-03 14:33:07 UTC (rev 1006)
@@ -0,0 +1,3 @@
+#!/usr/bin/env python
+
+
Modified: trunk/misc/osiris_xml/hitmarker.py
===================================================================
--- trunk/misc/osiris_xml/hitmarker.py 2013-10-31 23:53:59 UTC (rev 1005)
+++ trunk/misc/osiris_xml/hitmarker.py 2013-11-03 14:33:07 UTC (rev 1006)
@@ -183,7 +183,7 @@
for m2k, m2v in mv.iteritems():
#iterate the list of hits for this entry
for le in m2v:
- result[le[0]][le[1]] = 1
+ result[le[0]][le[1]] = mk
for le in result:
le = sorted(le)
@@ -200,29 +200,63 @@
continue
break
- for dk in sorti[curindex]:
+ maxdk_seen = -1
+
+ for dk, dv in sorti[curindex].iteritems():
+ if dk > maxdk_seen:
+ maxdk_seen = dk
+
# print 2*level*" ", "calling", curindex, "->", dk
- stack.append((curindex, dk))
+ stack.append((curindex, dk, dv))
ret= self.iterativeCombine(sorti, mem, stack, inplen, dk+1, level+1)
# print 2*level*" ", stack
if not ret:
mem.append(stack[:])
del stack[-1]
+
+ if level > 0:
+ return True
+
+ #additionally, we want to get all those meaningful combos where the
+ #curindex-th token is not included in the results
+ for i in range(curindex + 1, maxdk_seen + 1):
+ if sorti[i]:
+# print 2*level*" ", "do more from", i
+ self.iterativeCombine(sorti, mem, stack, inplen,
+ i, level + 1)
+
+
return True
def combineMarks(self, marks, inplen):
sorti = self.sortMarks(marks, inplen)
-# print sorti
+# print "sorti", sorti
mem = []
stack = []
+
+ singles = {}
+
+ killme = []
+ for i in range(inplen):
+ for k, v in sorti[i].iteritems():
+ if k == i:
+ singles[k] = v
+ killme.append((i, k))
+ for k, v in killme:
+ del sorti[k][v]
+# print "singles:", singles
+
self.iterativeCombine(sorti, mem, stack, inplen, 0, 0)
print "mem=", mem
- return mem
- def produceRetokenization(self, mem, inplen):
+ #finally, add the no-hit entry (i.e. all tokens)
+ mem.append([])
+ return mem, singles
+ def produceRetokenization(self, mem, singles, inplen):
+
result = []
for me in mem:
loclist = []
@@ -234,12 +268,18 @@
break
if memindex >= len(me):
- loclist.extend([(v, v) for v in range(tokindex, inplen)])
-# print "autofill"
+ for v in range(tokindex, inplen):
+ if v in singles:
+ loclist.append((v, v, singles[v]))
+ else:
+ loclist.append((v, v, -1))
break
if tokindex < me[memindex][0]:
- loclist.append((tokindex, tokindex))
+ if tokindex in singles:
+ loclist.append((tokindex, tokindex, singles[tokindex]))
+ else:
+ loclist.append((tokindex, tokindex, -1))
tokindex += 1
continue
else:
@@ -260,14 +300,22 @@
('cica', 100, 2, 0, 0, 1), ('cica', 100, 2, 1, 0, 1), \
('farka', 100, 2, 2, 1, 1), ('cica', 100, 1, 0, 0, 1), \
('farka', 100, 1, 1, 1, 1), ('cica', 405, 1, 0, 1, 11)]
+
+ lst = [('reti', 100, 4, 0, 0, 9), ('sas', 100, 4, 1, 1, 9),
+ ('sas', 101, 4, 0, 0, 9), ('fioka', 101, 4, 1, 1, 9),
+ ('fioka', 102, 4, 0, 0, 9), ('berlet', 102, 4, 1, 1, 9),
+ ('berlet', 103, 4, 0, 0, 9), ('dij', 103, 4, 1, 1, 9),
+ ('berlet', 113, 4, 0, 1, 9), ('fizetes', 200, 4, 0, 1, 10),
+ ('dij', 104, 4, 0, 0, 9), ('fizetes', 104, 4, 1, 1, 9)]
hm.fill(lst)
# print hm.hits
# tox = ['cica', 'rugja', 'mar', 'meg']
# tox = ['cica', 'rugja', 'meg', 'rugja']
tox = ['cica', 'cica', 'farka', 'cica', 'cica', 'farka']
+ tox = ['reti', 'sas', 'fioka', 'berlet', 'dij', 'fizetes']
rr = hm.lookup(tox)
print "result=", rr
- mem = hm.combineMarks(rr, len(tox))
- print hm.produceRetokenization(mem, len(tox))
+ mem, singles = hm.combineMarks(rr, len(tox))
+ print hm.produceRetokenization(mem, singles, len(tox))
More information about the Hejes-devel mailing list