From d3732a1aeefe3af9ffb4280dc50547befd42b065 Mon Sep 17 00:00:00 2001 From: Michael Murtaugh Date: Tue, 25 Aug 2015 12:44:16 +0200 Subject: [PATCH] updated --- trim.py => et_helpers.py | 0 etherdump | 12 ++++++------ 2 files changed, 6 insertions(+), 6 deletions(-) rename trim.py => et_helpers.py (100%) diff --git a/trim.py b/et_helpers.py similarity index 100% rename from trim.py rename to et_helpers.py diff --git a/etherdump b/etherdump index a239b64..7133ce6 100755 --- a/etherdump +++ b/etherdump @@ -7,11 +7,11 @@ from datetime import datetime from xml.etree import cElementTree as ET from urllib import urlencode from urllib2 import urlopen, HTTPError, URLError -# dependencies -import html5lib, jinja2 # local mods -from trim import trim_removed_spans, contents, set_text_contents, text_contents +from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents from linkify import linkify, urlify, filename_to_padid +# external dependencies (use pip to install these) +import html5lib, jinja2 def get_template_env (tpath=None): @@ -100,7 +100,6 @@ while len(todo) > 0: os.makedirs(args.path) except OSError: pass - # print ("{0}".format(padid).encode("utf-8"), file=sys.stderr) # _ # _ __ ___ ___| |_ __ _ @@ -184,6 +183,9 @@ while len(todo) > 0: if args.spider: for l in links: if l not in todo and l not in done: + if l.startswith("http://") or l.startswith("https://"): + print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr) + continue # if args.verbose: # print (" link: {0}".format(l), file=sys.stderr) todo.append(l) @@ -237,8 +239,6 @@ while len(todo) > 0: a.remove(br); a.insert(1, br) img.attrib['src'] = href - - # extract the style tag (with authorship colors) style = t.find(".//style") if style != None: