[Hejes-devel] [915] 2bsure
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Thu Aug 29 07:32:16 CEST 2013
Revision: 915
Author: hussami
Date: 2013-08-29 07:32:16 +0200 (Thu, 29 Aug 2013)
Log Message:
-----------
2bsure
Modified Paths:
--------------
trunk/misc/osiris_xml/ohanalyze.py
Modified: trunk/misc/osiris_xml/ohanalyze.py
===================================================================
--- trunk/misc/osiris_xml/ohanalyze.py 2013-08-28 16:36:39 UTC (rev 914)
+++ trunk/misc/osiris_xml/ohanalyze.py 2013-08-29 05:32:16 UTC (rev 915)
@@ -3,6 +3,7 @@
# coding: utf8
import sys
+import time
import copy
import re
sys.path.append("../../web2py/applications/helyesiras_webdev/modules")
@@ -51,6 +52,7 @@
# self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
self.splittags_ext = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3, '_IKEP': 4}
+ self.naughtychars = ' |;|-|/' #the separators we'll observe
if fill:
self.fill(self.db)
@@ -76,7 +78,14 @@
Param: the target string
Returns: the result list
"""
- l = re.split(' |;|-', s)
+# l = re.split(' |;|-|/', s)
+ l = re.split(self.naughtychars, s)
+ l2 = []
+ for le in l:
+ if le:
+ l2.append(le)
+ l = l2
+# print l
result = []
for sub in l:
# if sub.isdigit():
@@ -111,6 +120,7 @@
# print "indexate result=", result
return result
+
def string2intlist(self, s):
spl = s.split(',')
lst=[]
@@ -252,7 +262,8 @@
2. for all analyses: correct them
"""
if len(s) == 0:
- print "EXMPY"
+# dct[startind] = {len(s) : 1}
+ print "EXMPY", dct
return
# print " Start with", s, startind, "on level", level
hl = self.humorize(s)
@@ -281,6 +292,7 @@
def analyzeRecursion(self, s, dct, ind, lst, result):
l = len(s)
# print "anal:", ind, l
+# print dct
if ind not in dct:
return
# print dct[ind]
@@ -337,8 +349,11 @@
result.append(lst)
def addAnalysis(self, s, morphlist, result):
+# print [y.lex for y in morphlist]
relevant = self.correctHumorSplit2(\
[(y.lex, y.tag, len(y.lex)) for y in morphlist])
+ if not relevant:
+ return 0
t1,t2,t3 = relevant[-1]
if t2 == 'NOM' or t2 == 'e3':
@@ -491,21 +506,30 @@
#format: 2: zsofi, 3: sql
def try2(self, fmt):
+ if fmt == 3:
+ print "drop table word_indices;"
+ print "create table word_indices(lemma varchar(100), word_id integer, version integer, word_index integer, islast integer);"
+ print "create index wordindices_index on word_indices(lemma);"
+
# query = "select id, actual, norm from ohdict where id > 1000 limit 500";
# query = "select id, actual, norm from ohdict where id > 3022 limit 2";
# query = "select id, actual, norm from ohdict order by id asc";
+# query = "select id, actual, norm from ohdict where id > 31340 and id not in (10954, 10962, 26766, 28090, 31341, 41501, 41502, 41503, 72479, 72480, 72481, 74282, 72483, 72484) order by id asc";
+# query = "select id, actual, norm from ohdict where id=11405"
query = "select id, actual, norm from ohdict where id > 0 and id not in (253, 1617, 10954, 10962, 14118, 21042, 26766, 28090, 31341, 35355, 41501, 41502, 41503, 53195, 63253, 66216, 66217, 68562, 69228, 69229, 72479, 72480, 72481, 72482, 72483, 72484, 72565, 79810, 79812, 81608, 81609, 82178, 82181, 104301, 107148, 110649, 112890) order by id asc";
+# query = "select id, actual, norm from ohdict where id > 0 and id not in (253, 1617, 10954, 10962, 14118, 21042, 26766, 28090, 31341, 35355, 41501, 41502, 41503, 53195, 63253, 66216, 66217, 68562, 69228, 69229, 72479, 72480, 72481, 72482, 72483, 72484, 72565, 79810, 79812, 81608, 81609, 82178, 82181, 104301, 107148, 110649, 112890) order by id asc";
#107148: recursion depth!
self.db.execute(query)
results = self.db.fetchall()
counter = 0
- for row in results:
+ for ri, row in enumerate(results):
counter += 1
news = self.tokenize(row[1])
idval = int(row[0])
il = self.indexate(news, row[2], 2)
+ filterednorm = re.sub(self.naughtychars, '', row[2])
totsplits = []
for ne, nind in il:
termsplits = self.humorSplit(ne)
@@ -516,10 +540,13 @@
# print totsplits
crtl = self.cartese(totsplits)
# print crtl
- fin = self.getFinalSplits(row[0], row[2], crtl, fmt)
- print row[0], row[2]
+ fin = self.getFinalSplits(row[0], filterednorm, crtl, fmt)
+ if fmt == 2:
+ print row[0], filterednorm
for qq in fin:
print "\t" + qq
+ if ri % 500 == 0:
+ time.sleep(1)
if len(sys.argv) < 2:
More information about the Hejes-devel mailing list