[Hejes-devel] [1029] sitemap.xml frissito kesz

hejes-devel at nytud.hu hejes-devel at nytud.hu
Mon Nov 18 13:13:02 CET 2013


Revision: 1029
Author:   mittelholcz
Date:     2013-11-18 13:13:02 +0100 (Mon, 18 Nov 2013)
Log Message:
-----------
sitemap.xml frissito kesz

Added Paths:
-----------
    trunk/misc/dbblog/sitemap_updater.py

Added: trunk/misc/dbblog/sitemap_updater.py
===================================================================
--- trunk/misc/dbblog/sitemap_updater.py	                        (rev 0)
+++ trunk/misc/dbblog/sitemap_updater.py	2013-11-18 12:13:02 UTC (rev 1029)
@@ -0,0 +1,150 @@
+#! /usr/bin/python
+# coding=utf8
+
+from urllib2 import urlopen
+import fileinput
+from datetime import datetime
+
+
+class List:
+  """ Thin list wrapper whose identity is its first element.
+
+      Comparison and hashing look at element 0 only, so two instances with
+      the same first element count as the same member of a set, whatever
+      else has been appended to them.
+  """
+  def __init__(self, first_data):
+    self.l = []
+    self.l.append(first_data)
+
+  # --- container protocol: plain delegation to the wrapped list ---
+
+  def __getitem__(self, i):
+    return self.l[i]
+
+  def __setitem__(self, i, v):
+    self.l[i] = v
+
+  def append(self, v):
+    self.l += [v]
+
+  # --- identity protocol: only the first element takes part ---
+
+  def __hash__(self):
+    key = self.l[0]
+    return hash(key)
+
+  def __cmp__(self, other):
+    mine, theirs = self.l[0], other.l[0]
+    return cmp(mine, theirs)
+
+
+class FileProperties:
+  """ Plain record describing one document the script works on.
+
+      file_name -- path or URL the document is read from
+      tags      -- list of tag names (without angle brackets) to look for
+      url_pref  -- URL prefix that the links of this document start with
+  """
+  def __init__(self, file_name, tags, url_pref):
+    self.file_name, self.tags, self.url_pref = file_name, tags, url_pref
+
+
+class Extractor:
+  """ Callable that pulls tagged values out of a single line of text.
+
+      Calling an instance with a line returns a List whose first element is
+      the text found between the first configured tag pair, with the URL
+      prefix removed.  For sources whose name ends in '.rss' the text
+      between the second tag pair (the date) is appended as well.  A value
+      that cannot be found is None, so a line without the first tag yields
+      List(None).
+  """
+  def __init__(self, file_prop):
+    # file_prop supplies file_name, tags and url_pref (see FileProperties).
+    self.f = file_prop
+
+  def __call__(self, line):
+    ret = List(self._extract(line, self.f.tags[0]))
+    if self.f.file_name.endswith('.rss'):
+      ret.append( self._extract(line, self.f.tags[1], True) )
+    return ret
+
+  def _extract(self, line, tag, date=False):
+    """ Return the text between <tag> and </tag> in line, or None.
+
+        Unless date is true, the opening marker is '<tag>' immediately
+        followed by the URL prefix, so the prefix is not part of the result
+        and a link with a different prefix does not match at all.
+    """
+    opening = '<' + tag + '>'
+    if not date:
+      opening += self.f.url_pref
+    closing = '</' + tag + '>'
+    # Test the raw find() result: adding len(opening) first would turn the
+    # -1 "not found" marker into a valid-looking index.
+    found = line.find(opening)
+    if found == -1:
+      return None
+    start = found + len(opening)
+    # Only accept a closing tag that comes after the opening one.
+    end = line.find(closing, start)
+    if end == -1:
+      return None
+    return line[start:end]
+
+
+class Concatenator:
+  """ Reducer that renders feed entries as sitemap <url> elements.
+
+      Meant to be passed as the function argument of reduce(): every call
+      formats one (link, rss date) entry and appends it to the text
+      accumulated so far.
+  """
+  def __init__(self, f):
+    # Template of a single entry; only {link} and {date} vary per call.
+    self.item = ''.join([
+      '<url>\n  <loc>', f.url_pref, '{link}</loc>\n',
+      '  <lastmod>{date}</lastmod>\n',
+      '  <priority>0.50</priority>\n',
+      '</url>\n' ])
+
+  def __call__(self, link_date1, link_date2):
+    """ Return link_date1 (text so far) followed by the entry for link_date2.
+    """
+    link, rss_date = link_date2[0], link_date2[1]
+    entry = self.item.format(link=link, date=Date.rss2w3c(rss_date))
+    return link_date1 + entry
+
+
+class Date:
+  """ Helpers producing W3C datetime strings (YYYY-MM-DDThh:mm:ss+hh:mm),
+      the format written into the <lastmod> elements of sitemap.xml.
+  """
+
+  # English month abbreviations as they appear in RSS dates.  'Mai' is a
+  # misspelling, kept only so that existing lookups keep working.
+  months = {
+    'Jan':'01','Feb':'02','Mar':'03','Apr':'04','May':'05','Jun':'06',
+    'Jul':'07','Aug':'08','Sep':'09','Oct':'10','Nov':'11','Dec':'12',
+    'Mai':'05' }
+
+  # Offset used when a feed date carries no usable zone field.
+  default_tzd = '+01:00'
+
+  @staticmethod
+  def rss2w3c(rss_date):
+    """ Convert string in rss date format to w3c date format.
+        rss: Fri, 08 Nov 2013 12:29:17 +0100
+        w3c: 2013-11-08T12:29:17+01:00
+
+        The zone of the feed date is carried over; if it is missing or in
+        an unrecognised form, Date.default_tzd is used instead.
+        Raises KeyError for an unknown month abbreviation and IndexError
+        if the string has fewer than five whitespace-separated fields.
+    """
+    # split() without argument tolerates repeated or surrounding whitespace.
+    fields = rss_date.split()
+    YYYY = fields[3]
+    MM   = Date.months[ fields[2] ]
+    DD   = fields[1].zfill(2)   # RSS allows single-digit days
+    time = fields[4]
+    zone = fields[5] if len(fields) > 5 else ''
+    return YYYY+'-'+MM+'-'+DD+'T'+time+Date._tzd(zone)
+
+  @staticmethod
+  def _tzd(zone):
+    """ Convert an RSS zone field ('+0100', 'GMT') to W3C form ('+01:00').
+    """
+    if zone in ('GMT', 'UT', 'Z'):
+      return '+00:00'
+    if len(zone) == 5 and zone[0] in '+-' and zone[1:].isdigit():
+      return zone[:3] + ':' + zone[3:]
+    return Date.default_tzd
+
+  @staticmethod
+  def now_w3c():
+    """ Returns current local date and time in w3c date format, with the
+        offset the local clock currently has from UTC (so DST is reflected).
+    """
+    now = datetime.now()
+    # Local minus UTC, rounded to whole minutes to absorb the instant that
+    # passes between the two clock reads.
+    delta = now - datetime.utcnow()
+    seconds = delta.days * 86400 + delta.seconds + delta.microseconds / 1e6
+    minutes = int(round(seconds / 60.0))
+    sign = '-' if minutes < 0 else '+'
+    hours, minutes = divmod(abs(minutes), 60)
+    tzd = '%s%02d:%02d' % (sign, hours, minutes)
+    return now.strftime('%Y-%m-%dT%H:%M:%S') + tzd
+
+
+def insert(file_name, text):
+  """ Insert data into sitemap.xml.
+
+      Rewrites file_name in place: text is written just before the line
+      that starts with '</urlset>', and the <lastmod> line following the
+      blog index <loc> line is set to the current time (a new <lastmod>
+      line is added if the next line is not one).
+  """
+  # bi is true while processing the line right after the blog index <loc>.
+  bi = False
+  # With inplace=1 fileinput redirects standard output into the file, so
+  # every print below writes to the new version of file_name.
+  for i in fileinput.input(file_name, inplace=1):
+    if bi:
+      bi = False
+      d = '  <lastmod>' + str(Date.now_w3c()) + '</lastmod>\n'
+      # Replace an existing lastmod line, otherwise put a new one in front.
+      i = d if i.startswith('  <lastmod>') else d + i
+    # NOTE(review): because this is an elif, text is not inserted when the
+    # '</urlset>' line directly follows the blog index line.
+    elif i.startswith('</urlset>'): print text,
+    print i.rstrip()
+    if i.endswith('helyesiras/blog/index</loc>\n'): bi = True
+
+
+def main():
+  """ Add blog posts that are in the RSS feed but not yet in sitemap.xml.
+
+      Downloads the feed, reads the local sitemap, and hands the entries
+      whose link is missing from the sitemap over to insert().
+  """
+  r = FileProperties(
+      'http://htp-devel.nytud.hu/helyesiras_webdev/blog/feed.rss',
+      ['link', 'pubDate'],
+      'http://htp-devel.nytud.hu/helyesiras_webdev')
+  s = FileProperties(
+      './sitemap.xml', #TODO: Change this!
+      ['loc'],
+      'http://helyesiras.mta.hu/helyesiras')
+
+  # Lines without the wanted tag come back as List(None); they are dropped.
+  missing = set([ List(None) ])
+
+  # urllib2 responses are not context managers, hence try/finally.
+  feed = urlopen(r.file_name)
+  try:
+    rss = set(map(Extractor(r), feed.readlines())) - missing
+  finally:
+    feed.close()
+
+  # Close the sitemap before insert() rewrites the same file in place.
+  with open(s.file_name, 'r') as sitemap_file:
+    sitemap = set(map(Extractor(s), sitemap_file.readlines())) - missing
+
+  # List compares by its first element (the link) only, so this keeps the
+  # feed entries whose link does not occur in the sitemap.
+  ins = rss - sitemap
+  if ins: insert(s.file_name, reduce(Concatenator(s), ins, ''))
+
+
+
+# Run the update only when the file is executed as a script.
+if __name__ == '__main__':
+  main()
+
+#TODO: make a backup copy of the old sitemap.xml before rewriting it
+
+  #rss = urllib2.urlopen('http://htp-devel.nytud.hu/helyesiras_webdev/blog/feed.rss').readlines()
+
+# vim: tabstop=2 shiftwidth=2 expandtab
+


Property changes on: trunk/misc/dbblog/sitemap_updater.py
___________________________________________________________________
Added: svn:executable
   + *




More information about the Hejes-devel mailing list