diff --git a/generators/archive.py b/generators/archive.py index 6f65cbe..5b8d3b0 100644 --- a/generators/archive.py +++ b/generators/archive.py @@ -37,7 +37,7 @@ class Archive(Index): #print 'Generate ' + filename nodes = dom.getElementsByTagName("*") nodes[0] = self.parse(src, hooks, articles, dom, nodes[0]) - self.writeIfNotTheSame(output + self.dirname + '/' + filename, nodes[0].toxml('utf8')) + self.writeIfNotTheSame(output + self.dirname + '/' + filename, nodes[0]) self.cur_page = self.cur_page + 1 filename = self.filename + str(self.cur_page) + '.html' dom = parse(src + '/_archive.html') diff --git a/generators/article.py b/generators/article.py index dd226d7..1eb59d3 100644 --- a/generators/article.py +++ b/generators/article.py @@ -41,7 +41,7 @@ class Article(Index): if not os.path.exists(filename): os.makedirs(filename) filename = filename + article.title_slug + '.html' - self.writeIfNotTheSame(filename, nodes[0].toxml('utf8')) + self.writeIfNotTheSame(filename, nodes[0]) dom = parse(src + '/_article.html') if not self.somethingWrote: diff --git a/generators/category.py b/generators/category.py index a7e25c0..cd61bb6 100644 --- a/generators/category.py +++ b/generators/category.py @@ -73,7 +73,7 @@ class Category(Index): #print 'Generate ' + filename nodes = dom.getElementsByTagName("*") nodes[0] = self.parse(src, hooks, articles, dom, nodes[0]) - self.writeIfNotTheSame(output + self.dirname + '/' + filename, nodes[0].toxml('utf8')) + self.writeIfNotTheSame(output + self.dirname + '/' + filename, nodes[0]) self.cur_page = self.cur_page + 1 filename = self.filename + str(self.cur_page) + '.html' dom = parse(src + '/_category.html') diff --git a/generators/generator.py b/generators/generator.py index 65da4b7..cc4d91b 100644 --- a/generators/generator.py +++ b/generators/generator.py @@ -2,10 +2,42 @@ import os import hashlib import gzip import math +import codecs from xml.dom import * from xml.dom.minidom import parse from xml.parsers.expat import * +class StrictUTF8Writer(codecs.StreamWriter): + '''A StreamWriter for utf8 that requires written objects be unicode''' + encode = codecs.utf_8_encode + value = '' + + def __init__(self): + self.value = u'' + pass + + def write(self, object): + object = object.replace('<', '<') + object = object.replace('>', '>') + object = object.replace('"', '"') + object = object.replace(''', "'") + + if not type(object) == unicode: + self.value = self.value + unicode(object, "utf-8") + else: + self.value = self.value + object + return self.value + + def reset(self): + self.value = u'' + + def getvalue(self): + return self.value + #self.stream.write(object) + # if not isinstance(object, unicode): + # raise ValueError('write() requires unicode object') + # return codecs.StreamWriter.write(self, object) + class DynastieGenerator: URI = "http://indefero.soutade.fr/p/dynastie" @@ -38,14 +70,10 @@ class DynastieGenerator: res = math.ceil((nb_articles*1.0)/(nb_articles_per_page*1.0)) return int(res) - def removeCDATA(self, content): - content = content.replace('
') - content = content.replace(']]>', '') - - return content - - def writeIfNotTheSame(self, filename, content): - content = self.removeCDATA(content) + def writeIfNotTheSame(self, filename, node): + writer = StrictUTF8Writer() + node.writexml(writer) + content = writer.getvalue().encode('utf-8') if os.path.exists(filename): src_md5 = hashlib.md5() f = open(filename,'rb') diff --git a/generators/index.py b/generators/index.py index 52a5058..77ad2b5 100644 --- a/generators/index.py +++ b/generators/index.py @@ -77,16 +77,8 @@ class Index(DynastieGenerator): return f = open(filename, 'rb') - content = '