Load raw html in articles and fix a bug in createRecents

This commit is contained in:
Grégory Soutadé 2012-08-15 18:11:55 +02:00
parent af3c792450
commit a5c5e7edc8
5 changed files with 48 additions and 24 deletions

View File

@ -37,7 +37,7 @@ class Archive(Index):
#print 'Generate ' + filename #print 'Generate ' + filename
nodes = dom.getElementsByTagName("*") nodes = dom.getElementsByTagName("*")
nodes[0] = self.parse(src, hooks, articles, dom, nodes[0]) nodes[0] = self.parse(src, hooks, articles, dom, nodes[0])
self.writeIfNotTheSame(output + self.dirname + '/' + filename, nodes[0].toxml('utf8')) self.writeIfNotTheSame(output + self.dirname + '/' + filename, nodes[0])
self.cur_page = self.cur_page + 1 self.cur_page = self.cur_page + 1
filename = self.filename + str(self.cur_page) + '.html' filename = self.filename + str(self.cur_page) + '.html'
dom = parse(src + '/_archive.html') dom = parse(src + '/_archive.html')

View File

@ -41,7 +41,7 @@ class Article(Index):
if not os.path.exists(filename): if not os.path.exists(filename):
os.makedirs(filename) os.makedirs(filename)
filename = filename + article.title_slug + '.html' filename = filename + article.title_slug + '.html'
self.writeIfNotTheSame(filename, nodes[0].toxml('utf8')) self.writeIfNotTheSame(filename, nodes[0])
dom = parse(src + '/_article.html') dom = parse(src + '/_article.html')
if not self.somethingWrote: if not self.somethingWrote:

View File

@ -73,7 +73,7 @@ class Category(Index):
#print 'Generate ' + filename #print 'Generate ' + filename
nodes = dom.getElementsByTagName("*") nodes = dom.getElementsByTagName("*")
nodes[0] = self.parse(src, hooks, articles, dom, nodes[0]) nodes[0] = self.parse(src, hooks, articles, dom, nodes[0])
self.writeIfNotTheSame(output + self.dirname + '/' + filename, nodes[0].toxml('utf8')) self.writeIfNotTheSame(output + self.dirname + '/' + filename, nodes[0])
self.cur_page = self.cur_page + 1 self.cur_page = self.cur_page + 1
filename = self.filename + str(self.cur_page) + '.html' filename = self.filename + str(self.cur_page) + '.html'
dom = parse(src + '/_category.html') dom = parse(src + '/_category.html')

View File

@ -2,10 +2,42 @@ import os
import hashlib import hashlib
import gzip import gzip
import math import math
import codecs
from xml.dom import * from xml.dom import *
from xml.dom.minidom import parse from xml.dom.minidom import parse
from xml.parsers.expat import * from xml.parsers.expat import *
class StrictUTF8Writer(codecs.StreamWriter):
    '''In-memory writer that accumulates everything written as unicode text.

    It is meant to be handed to ``Node.writexml()``: each chunk written has
    the XML escapes ``&lt;``, ``&gt;``, ``&quot;`` and ``&apos;`` turned back
    into their literal characters, so markup stored in text nodes comes out
    as raw HTML. UTF-8 byte strings are accepted and decoded; unicode
    strings are used as they are.

    NOTE(review): ``&amp;`` is not reversed, so an ``&`` found in a text node
    still comes out as ``&amp;`` -- confirm whether that is intended for
    entities contained in raw article HTML.
    '''
    encode = codecs.utf_8_encode

    # (escaped form, literal character) pairs undone on every written chunk.
    _ENTITIES = (
        (u'&lt;', u'<'),
        (u'&gt;', u'>'),
        (u'&quot;', u'"'),
        (u'&apos;', u"'"),
    )

    def __init__(self):
        # There is no underlying stream, but the base initialiser must still
        # run: it defines self.stream, which the inherited __getattr__
        # delegates to. Without it, any lookup of a missing attribute
        # recurses endlessly instead of raising AttributeError.
        codecs.StreamWriter.__init__(self, None)
        self.value = u''

    def write(self, object):
        '''Append *object* to the buffer and return the whole buffer so far.

        *object* is a unicode string or a UTF-8 encoded byte string; the
        parameter name mirrors ``codecs.StreamWriter.write``.
        '''
        # Decode first so that the replacements and the concatenation below
        # always operate on unicode text.
        if isinstance(object, bytes):
            object = object.decode('utf-8')
        for escaped, literal in self._ENTITIES:
            object = object.replace(escaped, literal)
        self.value = self.value + object
        return self.value

    def reset(self):
        '''Discard everything written so far.'''
        self.value = u''

    def getvalue(self):
        '''Return the accumulated unicode text.'''
        return self.value
class DynastieGenerator: class DynastieGenerator:
URI = "http://indefero.soutade.fr/p/dynastie" URI = "http://indefero.soutade.fr/p/dynastie"
@ -38,14 +70,10 @@ class DynastieGenerator:
res = math.ceil((nb_articles*1.0)/(nb_articles_per_page*1.0)) res = math.ceil((nb_articles*1.0)/(nb_articles_per_page*1.0))
return int(res) return int(res)
def removeCDATA(self, content): def writeIfNotTheSame(self, filename, node):
content = content.replace('<pre><![CDATA[', '<pre>') writer = StrictUTF8Writer()
content = content.replace(']]></pre>', '</pre>') node.writexml(writer)
content = writer.getvalue().encode('utf-8')
return content
def writeIfNotTheSame(self, filename, content):
content = self.removeCDATA(content)
if os.path.exists(filename): if os.path.exists(filename):
src_md5 = hashlib.md5() src_md5 = hashlib.md5()
f = open(filename,'rb') f = open(filename,'rb')

View File

@ -77,16 +77,8 @@ class Index(DynastieGenerator):
return return
f = open(filename, 'rb') f = open(filename, 'rb')
content = '<div id="123">' + f.read() + '</div>' article_content = f.read()
f.close() f.close()
dom2 = None
try:
dom2 = parseString(content)
except ExpatError, e:
self.addError('Error parsing ' + filename)
print filename
print e
pass
self.simpleTransform(values, dom, article_elem, root) self.simpleTransform(values, dom, article_elem, root)
@ -96,9 +88,9 @@ class Index(DynastieGenerator):
the_class = content_node.getAttribute('class') the_class = content_node.getAttribute('class')
if not the_class in post_transform: if not the_class in post_transform:
continue continue
if the_class == 'article_content' and dom2 != None: if the_class == 'article_content':
for article_node in dom2.firstChild.childNodes: new_node = dom.createTextNode(article_content)
content_node.appendChild(article_node) content_node.appendChild(new_node)
def createArticles(self, articles, dom, root, node): def createArticles(self, articles, dom, root, node):
articles_elem = self.createElement(dom, 'articles') articles_elem = self.createElement(dom, 'articles')
@ -116,6 +108,10 @@ class Index(DynastieGenerator):
root.replaceChild(articles_elem, node) root.replaceChild(articles_elem, node)
def createRecents(self, articles, dom, root, node): def createRecents(self, articles, dom, root, node):
if self.cur_article == len(articles):
root.removeChild(node)
return
if node.hasAttribute("limit"): if node.hasAttribute("limit"):
nb_recents = int(node.getAttribute("limit")) nb_recents = int(node.getAttribute("limit"))
else: else:
@ -179,7 +175,7 @@ class Index(DynastieGenerator):
#print 'Generate ' + filename #print 'Generate ' + filename
nodes = dom.getElementsByTagName("*") nodes = dom.getElementsByTagName("*")
nodes[0] = self.parse(src, hooks, articles, dom, nodes[0]) nodes[0] = self.parse(src, hooks, articles, dom, nodes[0])
self.writeIfNotTheSame(output + '/' + filename, nodes[0].toxml(encoding='utf-8')) self.writeIfNotTheSame(output + '/' + filename, nodes[0])
self.cur_page = self.cur_page + 1 self.cur_page = self.cur_page + 1
filename = 'index' + str(self.cur_page) + '.html' filename = 'index' + str(self.cur_page) + '.html'
dom = parse(src + '/_index.html') dom = parse(src + '/_index.html')