[Hejes-devel] [1060] updating dates
hejes-devel at nytud.hu
Wed Dec 18 11:59:12 CET 2013
Revision: 1060
Author: mittelholcz
Date: 2013-12-18 11:59:12 +0100 (Wed, 18 Dec 2013)
Log Message:
-----------
updating dates
Modified Paths:
--------------
trunk/misc/dbblog/sitemap_updater.py
Modified: trunk/misc/dbblog/sitemap_updater.py
===================================================================
--- trunk/misc/dbblog/sitemap_updater.py 2013-12-17 11:37:56 UTC (rev 1059)
+++ trunk/misc/dbblog/sitemap_updater.py 2013-12-18 10:59:12 UTC (rev 1060)
@@ -8,73 +8,50 @@
from email.Utils import formatdate
-class List:
- """ Wrapper class, compares lists by first elements
- """
- def __init__(self, first_data):
- self.l = [ first_data ]
-
- def __cmp__(self, other):
- return cmp(self.l[0], other.l[0])
-
- def __hash__(self):
- return hash(self.l[0])
-
- def __getitem__(self, i):
- return self.l[i]
-
- def __setitem__(self, i, v):
- self.l[i] = v
-
- def append(self, v):
- self.l.append(v)
-
-
class FileProperties:
- """
+ """ Data storing about file
"""
- def __init__(self, file_name, tags, url_pref):
+ def __init__(self, file_name, link_tag, date_tag, url):
self.file_name = file_name
- self.tags = tags
- self.url_pref = url_pref
+ self.link_tag = link_tag
+ self.date_tag = date_tag
+ self.url = url
class Extractor:
- """ Extracts text between given tags from a string.
- Returns a List of substrings.
- If no tag in string, return List(None).
+ """ Extracts content between given tags from a string.
+ Returns a tuple of contents.
"""
def __init__(self, file_prop):
self.f = file_prop
- def __call__(self, line):
- ret = List(self._extract(line, self.f.tags[0]))
- if self.f.file_name.endswith('.rss'):
- ret.append( self._extract(line, self.f.tags[1], True) )
- return ret
-
def _extract(self, line, tag, date=False):
opening = '<' + tag + '>'
- opening += self.f.url_pref if not date else ''
+ opening += self.f.url if not date else '' # cut url prefix
closing = '</' + tag + '>'
- start = line.find(opening) + len(opening)
+ start = line.find(opening)
end = line.find(closing)
ret = None
if start != -1 and end != -1:
- ret = line[start:end]
+ ret = line[start+len(opening):end]
return ret
+ def __call__(self, line):
+ link = self._extract(line, self.f.link_tag)
+ date = self._extract(line, self.f.date_tag, True)
+ date = Date.rss2w3c(date) if date and self.f.file_name.endswith('.rss') else date # convert rss date to w3c
+ return (link, date)
class Concatenator:
- """ Tag strings and concatenate them.
+ """ Tags strings and concatenate them.
"""
def __init__(self, f):
- prefix = '<url>\n <loc>' + f.url_pref
+ prefix = '<url>\n <loc>' + f.url
suffix = '</lastmod>\n <priority>0.60</priority>\n</url>\n'
self.item = prefix + '{link}</loc>\n <lastmod>{date}' + suffix
def __call__(self, link_date1, link_date2):
- link_date2 = self.item.format( link=link_date2[0], date=Date.rss2w3c(link_date2[1]) )
+ link_date2 = self.item.format( link=link_date2[0], date=link_date2[1] )
return ( link_date1 + link_date2 )
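A minimal standalone sketch of what the reworked extraction step now does for a single RSS line; the sample <item> line, post slug and dates below are illustrative, not taken from the live feed:

    def extract(line, tag, prefix=''):
        # return the text between <tag>...</tag>, optionally dropping a URL prefix
        opening = '<' + tag + '>' + prefix
        closing = '</' + tag + '>'
        start = line.find(opening)
        end = line.find(closing)
        if start == -1 or end == -1:
            return None
        return line[start + len(opening):end]

    sample = ('<item><link>http://htp-devel.nytud.hu/helyesiras_webdev/blog/post-1</link>'
              '<pubDate>Wed, 18 Dec 2013 11:59:12 +0100</pubDate></item>')
    link = extract(sample, 'link', 'http://htp-devel.nytud.hu/helyesiras_webdev/blog/')
    date = extract(sample, 'pubDate')
    print((link, date))  # ('post-1', 'Wed, 18 Dec 2013 11:59:12 +0100')

In the committed code the pubDate is then run through Date.rss2w3c() before Concatenator formats the pair into a <url>...</url> sitemap entry.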
@@ -105,41 +82,55 @@
return Date.rss2w3c(formatdate(mktime(datetime.now().timetuple()), localtime=True))
-def insert(file_name, text):
+def insert(file_prop, new, update):
""" Insert data into sitemap.xml.
"""
- bi = False
- for i in fileinput.input(file_name, inplace=1):
- if bi:
+ bi = False # blog/index
+ up = False # update
+ e = Extractor(file_prop)
+ date_line = ''
+ for i in fileinput.input(file_prop.file_name, inplace=1):
+ line = e(i)[0]
+ if i.startswith('</urlset>'): print new,
+ elif line == 'index':
+ bi = True
+ date_line = ' <lastmod>' + str(Date.now_w3c()) + '</lastmod>\n'
+ elif line in update:
+ up = True
+ date_line = ' <lastmod>' + update[line] + '</lastmod>\n'
+ elif bi:
bi = False
- d = ' <lastmod>' + str(Date.now_w3c()) + '</lastmod>\n' # + ' <changefreq>weekly</changefreq>\n'
- i = d if i.startswith(' <lastmod>') else d + i
- elif i.startswith('</urlset>'): print text,
+ i = date_line if '<lastmod>' in i else date_line + i
+ elif up:
+ up = False
+ i = date_line if '<lastmod>' in i else date_line + i
print i.rstrip()
- if i.endswith('helyesiras/blog/index</loc>\n'): bi = True
def main():
r = FileProperties(
'http://htp-devel.nytud.hu/helyesiras_webdev/blog/feed.rss',
- ['link', 'pubDate'],
- 'http://htp-devel.nytud.hu/helyesiras_webdev')
+ 'link', 'pubDate',
+ 'http://htp-devel.nytud.hu/helyesiras_webdev/blog/')
+# s = FileProperties('./sitemap.xml', 'loc', 'lastmod', 'http://helyesiras.mta.hu/helyesiras/blog/') # test
s = FileProperties(
'../../web2py/applications/helyesiras_webdev/static/sitemap.xml',
- ['loc'],
+ 'loc', 'lastmod',
'http://helyesiras.mta.hu/helyesiras')
rss = urlopen(r.file_name).readlines()
- rss = set(map(Extractor(r), rss)) - set([ List(None) ])
+ rss = dict(map(Extractor(r), rss))
+ rss.pop(None, 0)
- sitemap = open(s.file_name, 'r').readlines()
- sitemap = set(map(Extractor(s), sitemap)) - set([ List(None) ])
+ sitemap = open(s.file_name, 'r').read().split('</url>')
+ sitemap = dict(map(Extractor(s), sitemap))
+ sitemap.pop(None, 0)
- ins = rss-sitemap
- if ins: insert(s.file_name, reduce(Concatenator(s), ins, ''))
+ update = dict([(x, rss[x]) for x in sitemap if (x in rss and sitemap[x]!=rss[x])])
+ new = [(x, rss[x]) for x in set(rss)-set(sitemap)]
+ if new or update: insert(s, reduce(Concatenator(s), new, ''), update)
-
if __name__ == '__main__':
main()
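For context, the new-versus-update split that main() now computes can be illustrated with a small self-contained sketch; the link slugs and dates below are invented, the dict shapes mirror the rss and sitemap dicts built above:

    # links are dict keys, dates are values, as produced by Extractor
    rss     = {'post-1': '2013-12-18T11:59:12+01:00',
               'post-2': '2013-12-17T10:00:00+01:00'}
    sitemap = {'post-1': '2013-12-01T09:00:00+01:00'}

    # present in both but with a different date -> rewrite the <lastmod> in place
    update = dict((x, rss[x]) for x in sitemap if x in rss and sitemap[x] != rss[x])
    # present only in the feed -> concatenated and printed before </urlset>
    new = [(x, rss[x]) for x in set(rss) - set(sitemap)]

    print(update)  # {'post-1': '2013-12-18T11:59:12+01:00'}
    print(new)     # [('post-2', '2013-12-17T10:00:00+01:00')]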