[Hejes-devel] [1029] sitemap.xml frissito kesz
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Mon Nov 18 13:13:02 CET 2013
Revision: 1029
Author: mittelholcz
Date: 2013-11-18 13:13:02 +0100 (Mon, 18 Nov 2013)
Log Message:
-----------
sitemap.xml frissito kesz
Added Paths:
-----------
trunk/misc/dbblog/sitemap_updater.py
Added: trunk/misc/dbblog/sitemap_updater.py
===================================================================
--- trunk/misc/dbblog/sitemap_updater.py (rev 0)
+++ trunk/misc/dbblog/sitemap_updater.py 2013-11-18 12:13:02 UTC (rev 1029)
@@ -0,0 +1,150 @@
+#! /usr/bin/python
+# coding=utf8
+
+from urllib2 import urlopen
+import fileinput
+from datetime import datetime
+
+
+class List:
+ """ Wrapper class, compares lists by first elements
+ """
+ def __init__(self, first_data):
+ self.l = [ first_data ]
+
+ def __cmp__(self, other):
+ return cmp(self.l[0], other.l[0])
+
+ def __hash__(self):
+ return hash(self.l[0])
+
+ def __getitem__(self, i):
+ return self.l[i]
+
+ def __setitem__(self, i, v):
+ self.l[i] = v
+
+ def append(self, v):
+ self.l.append(v)
+
+
+class FileProperties:
+ """
+ """
+ def __init__(self, file_name, tags, url_pref):
+ self.file_name = file_name
+ self.tags = tags
+ self.url_pref = url_pref
+
+
+class Extractor:
+ """ Extract text between given tags from a string.
+ Returns a List of substrings.
+ If no tag in string, return List(None).
+ """
+ def __init__(self, file_prop):
+ self.f = file_prop
+
+ def __call__(self, line):
+ ret = List(self._extract(line, self.f.tags[0]))
+ if self.f.file_name.endswith('.rss'):
+ ret.append( self._extract(line, self.f.tags[1], True) )
+ return ret
+
+ def _extract(self, line, tag, date=False):
+ opening = '<' + tag + '>'
+ opening += self.f.url_pref if not date else ''
+ closing = '</' + tag + '>'
+ start = line.find(opening) + len(opening)
+ end = line.find(closing)
+ ret = None
+ if start != -1 and end != -1:
+ ret = line[start:end]
+ return ret
+
+
+class Concatenator:
+ """ Tag strings and concatenate them.
+ """
+ def __init__(self, f):
+ prefix = '<url>\n <loc>' + f.url_pref
+ suffix = '</lastmod>\n <priority>0.50</priority>\n</url>\n'
+ self.item = prefix + '{link}</loc>\n <lastmod>{date}' + suffix
+
+ def __call__(self, link_date1, link_date2):
+ link_date2 = self.item.format( link=link_date2[0], date=Date.rss2w3c(link_date2[1]) )
+ return ( link_date1 + link_date2 )
+
+
+class Date:
+
+ months = {
+ 'Jan':'01','Feb':'02','Mar':'03','Apr':'04','Mai':'05','Jun':'06',
+ 'Jul':'07','Aug':'08','Sep':'09','Oct':'10','Nov':'11','Dec':'12' }
+
+ @staticmethod
+ def rss2w3c(rss_date):
+ """ Convert string in rss date format to w3c date format.
+ rss: Fri, 08 Nov 2013 12:29:17 +0100
+ w3c: 2013-11-08T12:29:17+01:00
+ """
+ rss_date = rss_date.split(' ')
+ YYYY = rss_date[3]
+ MM = Date.months[ rss_date[2] ]
+ DD = rss_date[1]
+ time = rss_date[4]
+ TZD = '+01:00'
+ return YYYY+'-'+MM+'-'+DD+'T'+time+TZD
+
+ @staticmethod
+ def now_w3c():
+ """ Returns current date in w3c date format.
+ """
+ dt = str(datetime.now()).split(' ')
+ return dt[0]+'T'+dt[1].split('.')[0]+'+01:00'
+
+
+def insert(file_name, text):
+ """ Insert data into sitemap.xml.
+ """
+ bi = False
+ for i in fileinput.input(file_name, inplace=1):
+ if bi:
+ bi = False
+ d = ' <lastmod>' + str(Date.now_w3c()) + '</lastmod>\n'
+ i = d if i.startswith(' <lastmod>') else d + i
+ elif i.startswith('</urlset>'): print text,
+ print i.rstrip()
+ if i.endswith('helyesiras/blog/index</loc>\n'): bi = True
+
+
+def main():
+ r = FileProperties(
+ 'http://htp-devel.nytud.hu/helyesiras_webdev/blog/feed.rss',
+ ['link', 'pubDate'],
+ 'http://htp-devel.nytud.hu/helyesiras_webdev')
+ s = FileProperties(
+ './sitemap.xml', #TODO: Change this!
+ ['loc'],
+ 'http://helyesiras.mta.hu/helyesiras')
+
+ rss = urlopen(r.file_name).readlines()
+ rss = set(map(Extractor(r), rss)) - set([ List(None) ])
+
+ sitemap = open(s.file_name, 'r').readlines()
+ sitemap = set(map(Extractor(s), sitemap)) - set([ List(None) ])
+
+ ins = rss-sitemap
+ if ins: insert(s.file_name, reduce(Concatenator(s), ins, ''))
+
+
+
+# Run the update only when executed as a script, not when imported.
+if __name__ == '__main__':
+ main()
+
+#TODO: backup file from old sitemap.xml
+
+ #rss = urllib2.urlopen('http://htp-devel.nytud.hu/helyesiras_webdev/blog/feed.rss').readlines()
+
+# vim: tabstop=2 shiftwidth=2 expandtab
+
Property changes on: trunk/misc/dbblog/sitemap_updater.py
___________________________________________________________________
Added: svn:executable
+ *
More information about the Hejes-devel
mailing list