@ -10,7 +10,8 @@ from time import sleep
from html5tidy import html5tidy
import html5lib
from xml . etree import ElementTree as ET
# debugging
# import ElementTree as ET
"""
pull ( meta ) :
@ -94,7 +95,7 @@ storing enough information to reconstruct (or understand an error occurred)
meta [ ' padid ' ] = padid . encode ( " utf-8 " )
versions = meta [ " versions " ] = [ ]
versions . append ( {
" url " : padurlbase + padid . encode ( " utf-8 " ) ,
" url " : padurlbase + quote ( padid . encode ( " utf-8 " ) ) , # this quote was really important for dealing with rogue chars like \xa0 in a padid;
" type " : " pad " ,
" code " : 200
} )
@ -174,11 +175,11 @@ storing enough information to reconstruct (or understand an error occurred)
html = html [ ' data ' ] [ ' html ' ]
ver [ " path " ] = p + " .diff.html "
ver [ " url " ] = quote ( ver [ " path " ] )
doc = html5lib . parse ( html , namespaceHTMLElements = False )
doc = html5lib . parse ( html . encode ( " utf-8 " ) , encoding = " utf-8 " , namespaceHTMLElements = False )
html5tidy ( doc , indent = True , title = padid , scripts = " ../versions.js " , links = links )
with open ( ver [ " path " ] , " w " ) as f :
# f.write(html.encode("utf-8"))
print ( ET . tostring ( doc , method = " html " , encoding = " unicode " ) . encode ( " utf-8 " ) , file = f )
print ( ET . tostring ( doc , method = " html " , encoding = " utf-8 " ) , file = f )
# Process text, html, dhtml, all options
if args . all or args . html :
@ -190,12 +191,11 @@ storing enough information to reconstruct (or understand an error occurred)
html = html [ ' data ' ] [ ' html ' ]
ver [ " path " ] = p + " .raw.html "
ver [ " url " ] = quote ( ver [ " path " ] )
doc = html5lib . parse ( html , namespaceHTMLElements = False )
html5tidy ( doc , indent = True , title = padid , scripts = " ../versions.js " , links = links )
with open ( ver [ " path " ] , " w " ) as f :
# f.write(html.encode("utf-8"))
print ( ET . tostring ( doc , method = " html " , encoding = " unicode " ) . encode ( " u tf-8 " ) , file = f )
print ( ET . tostring ( doc , method = " html " , encoding = " utf-8 " ) , file = f )
# output meta
if args . all or args . meta :