[Hejes-devel] [892] code beautificashun in hitmarker, index table creation in ohanalyze
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Sun Aug 4 22:14:45 CEST 2013
Revision: 892
Author: hussami
Date: 2013-08-04 22:14:44 +0200 (Sun, 04 Aug 2013)
Log Message:
-----------
code beautificashun in hitmarker, index table creation in ohanalyze
Modified Paths:
--------------
trunk/misc/osiris_xml/hitmarker.py
trunk/misc/osiris_xml/ohanalyze.py
Modified: trunk/misc/osiris_xml/hitmarker.py
===================================================================
--- trunk/misc/osiris_xml/hitmarker.py 2013-08-04 18:51:24 UTC (rev 891)
+++ trunk/misc/osiris_xml/hitmarker.py 2013-08-04 20:14:44 UTC (rev 892)
@@ -86,7 +86,28 @@
hits[term_id][version] = []
hits[term_id][version].append((begin, end))
+ def addWDEntry(self, dct, term_id, version, cur_index, self_index, start):
+ if term_id not in dct:
+ dct[term_id] = {}
+ if version not in dct[term_id]:
+ dct[term_id][version] = {}
+
+ dct[term_id][version][cur_index] = (self_index, start)
+
+ def isContinuation(self, dct, term_id, version, cur_index):
+ if term_id not in dct:
+ return False
+
+ if version not in dct[term_id]:
+ return False
+
+ if (cur_index - 1) not in dct[term_id][version]:
+ return False
+
+ return True
+
+
def lookup(self, lst):
"""
lst is a list of keys
@@ -96,71 +117,52 @@
#for all elements of the list
for li, le in enumerate(lst):
- print "\twd at ", li, " out of ", len(lst), " is ", workdict
newworkdict = {}
#first: find all entries for this key
entries = self.hits.getCompleteHit(le)
if not entries:
workdict = {}
continue
- print entries
for term_id, term_desc_list in entries.iteritems():
#banned term? forget about it
if term_id in self.hits.banned:
continue
- print "termid=", term_id
for term_desc in term_desc_list:
version = term_desc[0]
- print "\tnow looking at version: " + str(version), term_desc[1]
+
#is this id already in progress?
- cont_branch = (term_id in workdict)
- if cont_branch:
- cont_branch = version in workdict[term_id]
- if cont_branch:
- cont_branch = (term_desc[1] - 1) in \
- workdict[term_id][version]
+ cont_branch = self.isContinuation(workdict, term_id,
+ version, term_desc[1])
#if it is in progress, see if we can now upgrade or exit
if cont_branch:
workentry = workdict[term_id][version][term_desc[1] - 1]
+
#is this a continuation by index?
if term_desc[1] == \
workentry[0] + 1:
+
#is this the last entry for the term?
if term_desc[2]:
- print "A hit!"
self.recordHit(result, term_id, version,
workentry[1], li)
+
else:
- if term_id not in newworkdict:
- newworkdict[term_id] = {}
- if version not in newworkdict[term_id]:
- newworkdict[term_id][version] = {}
- newworkdict[term_id][version][term_desc[1]] = (
- workentry[0] + 1, workentry[1])
- print "an upgrade!"
- else:
-# del workdict[term_id]
- print "This did not match"
+ self.addWDEntry(newworkdict, term_id, version,
+ term_desc[1], workentry[0]+1, workentry[1])
#anyway, we should check if it's a new entry
- print "td1=", term_desc[1]
if term_desc[1] == 0: #a beginning
+
if term_desc[2]: #and end as well!
- print "hit2"
self.recordHit(result, term_id, version, li, li)
+
else: #record new
- if term_id not in newworkdict:
- newworkdict[term_id] = {}
+ self.addWDEntry(newworkdict, term_id, version,
+ 0, 0, li)
- if version not in newworkdict[term_id]:
- newworkdict[term_id][version] = {}
-
- newworkdict[term_id][version][0] = (0, li)
- print "a newie"
-
workdict = newworkdict
return result
@@ -174,10 +176,9 @@
('farka', 100, 1, 1, 1, 1), ('cica', 405, 1, 0, 1, 11)]
hm.fill(lst)
- print hm.hits
+# print hm.hits
# tox = ['cica', 'rugja', 'mar', 'meg']
# tox = ['cica', 'rugja', 'meg', 'rugja']
tox = ['cica', 'cica', 'farka', 'cica', 'cica', 'farka']
- print "lento",len(tox)
rr = hm.lookup(tox)
print "result=", rr
Modified: trunk/misc/osiris_xml/ohanalyze.py
===================================================================
--- trunk/misc/osiris_xml/ohanalyze.py 2013-08-04 18:51:24 UTC (rev 891)
+++ trunk/misc/osiris_xml/ohanalyze.py 2013-08-04 20:14:44 UTC (rev 892)
@@ -85,7 +85,7 @@
result.append(sub)
return result
- def indexate(self, lst, norm):
+ def indexate(self, lst, norm, ind_type = 2):
"""Finds an ordered set of strings in a normalized string
Param: the list and the normalized string
Returns: a list of tuples
@@ -93,15 +93,18 @@
"""
mindex = 0
result = []
- for l in lst:
- ind = norm.find(l, mindex)
+ for li, le in enumerate(lst):
+ ind = norm.find(le, mindex)
if mindex == -1:
raise 'Problem: ' + norm
#convert numbers!
# if l.isdigit():
# t = "_NUMBER_", ind
# else:
- t = l, ind
+ if ind_type == 1:
+ t = le, ind
+ else:
+ t = le, li
result.append(t)
mindex = ind + 1;
@@ -398,7 +401,8 @@
# self.checkTags(l)
- def makeIncidences(self, where):
+ #for indexing_type use: 1 - char indices (in norm), else: token indices
+ def makeIncidences(self, where, indexing_type = 2):
if where == 0:
self.output("use dbdict", where)
self.output("drop table incidences", where)
@@ -418,7 +422,7 @@
counter += 1
news = self.tokenize(row[1])
idval = int(row[0])
- il = self.indexate(news, row[2])
+ il = self.indexate(news, row[2], indexing_type)
for ne, nind in il:
self.output("insert into incidences(term, dict_id, idx) values('" + \
ne + "', " + str(idval) + ", " + str(nind) + ")", where)
@@ -442,9 +446,9 @@
print "prl=", prl
print "\t" + "+".join(prl)
- def getFinalSplits(self, s, splits):
+ def getFinalSplits(self, word_id, s, splits, fmt):
result = []
- for spt in splits:
+ for spi, spt in enumerate(splits):
previndex = 0
prl = []
# print spt
@@ -453,7 +457,13 @@
prl.append(s[previndex:curindex])
# print "\t", previndex, curindex, s[previndex:curindex]
previndex = curindex
- result.append('+'.join(prl))
+ if fmt == 2:
+ result.append('+'.join(prl))
+ else:
+ ll = len(prl) - 1
+ for pi, pe in enumerate(prl):
+ result.append("insert into word_indices(lemma, word_id, version, word_index, islast) values ('{0}', {1}, {2}, {3}, {4});".format(pe, word_id, spi, pi, int(pi == ll)))
+
return result
@@ -478,7 +488,8 @@
# print "CARTESE:", result
return result
- def try2(self, where):
+ #format: 2: zsofi, 3: sql
+ def try2(self, fmt):
# query = "select id, actual, norm from ohdict where id > 1000 limit 500";
# query = "select id, actual, norm from ohdict where id > 3022 limit 2";
@@ -494,7 +505,7 @@
counter += 1
news = self.tokenize(row[1])
idval = int(row[0])
- il = self.indexate(news, row[2])
+ il = self.indexate(news, row[2], 2)
totsplits = []
for ne, nind in il:
termsplits = self.humorSplit(ne)
@@ -505,7 +516,7 @@
# print totsplits
crtl = self.cartese(totsplits)
# print crtl
- fin = self.getFinalSplits(row[2], crtl)
+ fin = self.getFinalSplits(row[0], row[2], crtl, fmt)
print row[0], row[2]
for qq in fin:
print "\t" + qq
@@ -521,9 +532,9 @@
dumpmsgs = False
oh = OHAnalyze(False)
-#oh.makeIncidences(1)
+#oh.makeIncidences(0, 1)
oh.fill()
-oh.try2(2)
+oh.try2(3)
#oh.humorSplit("agyagtalajhumorista", ll)
#print oh.getAllSplits("virslitad", 0)
#print "---"
More information about the Hejes-devel
mailing list