|
@ -7,11 +7,11 @@ from datetime import datetime |
|
|
from xml.etree import cElementTree as ET |
|
|
from xml.etree import cElementTree as ET |
|
|
from urllib import urlencode |
|
|
from urllib import urlencode |
|
|
from urllib2 import urlopen, HTTPError, URLError |
|
|
from urllib2 import urlopen, HTTPError, URLError |
|
|
# dependencies |
|
|
|
|
|
import html5lib, jinja2 |
|
|
|
|
|
# local mods |
|
|
# local mods |
|
|
from trim import trim_removed_spans, contents, set_text_contents, text_contents |
|
|
from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents |
|
|
from linkify import linkify, urlify, filename_to_padid |
|
|
from linkify import linkify, urlify, filename_to_padid |
|
|
|
|
|
# external dependencies (use pip to install these) |
|
|
|
|
|
import html5lib, jinja2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_template_env (tpath=None): |
|
|
def get_template_env (tpath=None): |
|
@ -100,7 +100,6 @@ while len(todo) > 0: |
|
|
os.makedirs(args.path) |
|
|
os.makedirs(args.path) |
|
|
except OSError: |
|
|
except OSError: |
|
|
pass |
|
|
pass |
|
|
# print ("{0}".format(padid).encode("utf-8"), file=sys.stderr) |
|
|
|
|
|
|
|
|
|
|
|
# _ |
|
|
# _ |
|
|
# _ __ ___ ___| |_ __ _ |
|
|
# _ __ ___ ___| |_ __ _ |
|
@ -184,6 +183,9 @@ while len(todo) > 0: |
|
|
if args.spider: |
|
|
if args.spider: |
|
|
for l in links: |
|
|
for l in links: |
|
|
if l not in todo and l not in done: |
|
|
if l not in todo and l not in done: |
|
|
|
|
|
if l.startswith("http://") or l.startswith("https://"): |
|
|
|
|
|
print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr) |
|
|
|
|
|
continue |
|
|
# if args.verbose: |
|
|
# if args.verbose: |
|
|
# print (" link: {0}".format(l), file=sys.stderr) |
|
|
# print (" link: {0}".format(l), file=sys.stderr) |
|
|
todo.append(l) |
|
|
todo.append(l) |
|
@ -237,8 +239,6 @@ while len(todo) > 0: |
|
|
a.remove(br); a.insert(1, br) |
|
|
a.remove(br); a.insert(1, br) |
|
|
img.attrib['src'] = href |
|
|
img.attrib['src'] = href |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# extract the style tag (with authorship colors) |
|
|
# extract the style tag (with authorship colors) |
|
|
style = t.find(".//style") |
|
|
style = t.find(".//style") |
|
|
if style != None: |
|
|
if style != None: |
|
|