#!/usr/bin/env python
# License: AGPL
#
#
# todo:
# Capture exceptions and record HTTP status errors (e.g. 502) in the meta,
# so that an eventual index can flag the problematic pads!
# Also: provide links to text-only / html versions when the diff HTML fails
from __future__ import print_function
from etherdump import DATAPATH

# stdlib
import json, sys, os, re
from argparse import ArgumentParser
from datetime import datetime
from xml.etree import cElementTree as ET
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from time import sleep
# external dependencies (use pip to install these)
import html5lib, jinja2

def filename_to_padid(t):
    t = t.replace("_", " ")
    t = re.sub(r"\.html$", "", t)
    return t
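# Illustrative example: filename_to_padid("my_pad.html") -> "my pad"
# (roughly the inverse of urlify below)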

def normalize_pad_name(n):
    if '?' in n:
        n = n.split('?', 1)[0]
    if '/' in n:
        n = n.split('/', 1)[0]
    return n
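# Illustrative examples: normalize_pad_name("notes?x=1") and
# normalize_pad_name("notes/timeslider") both give "notes"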

def urlify(t, ext=".html"):
    return t.replace(" ", "_") + ext
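# Illustrative example: urlify("my pad") -> "my_pad.html"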

def linkify(src, urlify=urlify):
    collect = []

    def s(m):
        contents = strip_tags(m.group(1))
        contents = normalize_pad_name(contents)
        collect.append(contents)
        link = urlify(contents)
        # link = link.split("?", 1)[0]
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)

    # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
    ## question marks are ignored by etherpad, so split/strip them
    ## strip slashes as well (e.g. /timeslider)
    src = re.sub(r"\[\[(.+?)\]\]", s, src)
    return (src, collect)
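# Illustrative example: linkify('see [[my pad]]') returns
#   ('see [[<a class="wikilink" href="my_pad.html">my pad</a>]]', ['my pad'])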

def strip_tags(text):
    return re.sub(r"<.*?>", "", text)

def set_text_contents(element, text):
    """ok this isn't really general, but works for singly wrapped elements"""
    while len(element) == 1:
        element = element[0]
    element.text = text

def text_contents(element):
    return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')

def contents(element, method="html"):
    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
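# Illustrative example of the three helpers above, assuming an element
# parsed from '<a href="#"><span>hi</span></a>':
#   set_text_contents(a, "bye")  # the innermost text becomes "bye"
#   text_contents(a)             # -> 'bye' (all text, recursively)
#   contents(a)                  # -> '<span>bye</span>' (children as markup)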

def get_parent(tree, elt):
    # ElementTree elements keep no parent pointers, so scan the whole tree
    for parent in tree.iter():
        for child in parent:
            if child == elt:
                return parent

def remove_recursive(tree, elt):
    """Remove element and (any resulting) empty containing elements"""
    p = get_parent(tree, elt)
    if p is not None:
        p.remove(elt)
        if len(p) == 0 and (p.text is None or p.text.strip() == ""):
            # print ("empty parent", p, file=sys.stderr)
            remove_recursive(tree, p)
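# For example, removing the only <span> of an otherwise empty <p> with
# remove_recursive also removes the <p> (and so on upward).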

def trim_removed_spans(t):
    # remove <span class="removed"> and empty parents
    for n in t.findall(".//span[@class='removed']"):
        remove_recursive(t, n)
    # then strip any leading br's from body
    while True:
        tag = t.find("./body")[0]
        if tag.tag == "br":
            remove_recursive(t, tag)
        else:
            break

def get_template_env(tpath=None):
    paths = []
    if tpath and os.path.isdir(tpath):
        paths.append(tpath)
    # paths.append(TEMPLATES_PATH)
    loader = jinja2.FileSystemLoader(paths)
    env = jinja2.Environment(loader=loader)
    return env
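# Typical use (mirrored in main below):
#   env = get_template_env(args.templatepath)
#   colors_template = env.get_template("pad_colors.html")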

def get_group_info(gid, info):
    if 'groups' in info:
        if gid in info['groups']:
            return info['groups'][gid]
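# For example, with info loaded from a padinfo.json containing
# {"groups": {"yIRLMysh0PMsCMHc": {"name": "mygroup"}}} (illustrative values),
# get_group_info("yIRLMysh0PMsCMHc", info) returns {"name": "mygroup"}.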

def main(args):
    p = ArgumentParser("""
      _   _                   _
  ___| |_| |__   ___ _ __ __| |_   _ _ __ ___  _ __
 / _ \ __| '_ \ / _ \ '__/ _` | | | | '_ ` _ \| '_ \ 
|  __/ |_| | | |  __/ | | (_| | |_| | | | | | | |_) |
 \___|\__|_| |_|\___|_|  \__,_|\__,_|_| |_| |_| .__/
                                              |_|
""")
p . add_argument ( " padid " , default = [ ] , nargs = " * " , help = " the padid(s) to process " )
p . add_argument ( " --padinfo " , default = " padinfo.json " , help = " JSON file with login data for the pad (url, apikey etc), default: padinfo.json " )
p . add_argument ( " --path " , default = " output " , help = " path to save files, default: output " )
p . add_argument ( " --verbose " , default = False , action = " store_true " , help = " flag for verbose output " )
p . add_argument ( " --limit " , type = int , default = None )
p . add_argument ( " --allpads " , default = False , action = " store_true " , help = " flag to process all pads " )
p . add_argument ( " --templatepath " , default = os . path . join ( DATAPATH , " templates " ) , help = " directory with templates (override default files) " )
p . add_argument ( " --colors-template " , default = " pad_colors.html " , help = " pad with authorship colors template name: pad_colors.html " )
p . add_argument ( " --padlink " , default = [ ] , action = " append " , help = " give a pad link pattern, example: ' http \ : \ / \ /10 \ .1 \ .10 \ .1/p/(.*) ' " )
p . add_argument ( " --linksearch " , default = [ ] , action = " append " , help = " specify a link pattern to search for " )
p . add_argument ( " --linkreplace " , default = [ ] , action = " append " , help = " specify a replacement pattern to replace preceding linksearch " )
p . add_argument ( " --showurls " , default = False , action = " store_true " , help = " flag to display API URLs that are used (to stderr) " )
p . add_argument ( " --hidepaths " , default = False , action = " store_true " , help = " flag to not display paths " )
p . add_argument ( " --pretend " , default = False , action = " store_true " , help = " flag to not actually save " )
p . add_argument ( " --linkify " , default = False , action = " store_true " , help = " flag to process [[link]] forms (and follow when --spider is used) " )
p . add_argument ( " --spider " , default = False , action = " store_true " , help = " flag to spider pads (requires --linkify) " )
p . add_argument ( " --add-images " , default = False , action = " store_true " , help = " flag to add image tags " )
p . add_argument ( " --force " , default = False , action = " store_true " , help = " force dump (even if not updated since last dump) " )
p . add_argument ( " --authors-css " , default = None , help = " filename to save collected authorship css (nb: any existing file will be mercilessly overwritten), default: don ' t accumulate css " )
    # TODO: css from pad --- i.e. specify a padid for a stylesheet
    # p.add_argument("--css", default="styles.css", help="padid of stylesheet")
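    # The file named by --padinfo is assumed to look something like this
    # (illustrative values only; "padlink" and "groups" are optional):
    #
    # {
    #     "protocol": "http",
    #     "hostname": "localhost",
    #     "port": 9001,
    #     "apiurl": "/api/",
    #     "apiversion": "1.2.9",
    #     "apikey": "...",
    #     "padlink": ["http://localhost:9001/p/(.*)"],
    #     "groups": {"yIRLMysh0PMsCMHc": {"name": "mygroup"}}
    # }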
    args = p.parse_args(args)

    with open(args.padinfo) as f:
        info = json.load(f)
    apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
    # padlinkpats are for mapping internal pad links,
    # linkpats are any other link replacements; both are regexps
    padlinkpats = []
    linkpats = []  # [(pat, "\\1.html") for pat in padlinkpats]
    linkpats.extend(zip(args.linksearch, args.linkreplace))
    if "padlink" in info:
        if isinstance(info['padlink'], list):
            padlinkpats.extend(info['padlink'])
        else:
            padlinkpats.append(info['padlink'])
    padlinkpats.extend(args.padlink)
    env = get_template_env(args.templatepath)
    colors_template = env.get_template(args.colors_template)

    todo = args.padid
    done = set()
    count = 0
    data = {}
    authors_css_rules = {}
    data['apikey'] = info['apikey']

    if args.allpads:
        # push the list of all pad names on to todo
        list_url = apiurl + 'listAllPads?' + urlencode(data)
        if args.showurls:
            print(list_url, file=sys.stderr)
        results = json.load(urlopen(list_url))['data']['padIDs']
        todo.extend(results)
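        # (the API URLs built this way look something like, with a
        #  hypothetical host and key:
        #  http://localhost:9001/api/1.2.9/listAllPads?apikey=...)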

    while len(todo) > 0:
        padid = todo[0]
        todo = todo[1:]
        done.add(padid)
        data['padID'] = padid.encode("utf-8")
        if args.verbose:
            print(u"PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)

        # group pads have ids like "g.yIRLMysh0PMsCMHc$padname":
        # strip off the group part and look up the group's info
        grouppat = re.compile(ur"^g\.(\w+)\$(.+)$")
        m = grouppat.search(padid)
        if m:
            group = m.group(1)
            ginfo = get_group_info(group, info)
            if not ginfo:
                print("No info for group '{0}', skipping".format(group), file=sys.stderr)
                continue
            padid = m.group(2)
        else:
            group = None
            ginfo = None
        if not args.pretend:
            try:
                if ginfo:
                    os.makedirs(os.path.join(args.path, ginfo['name']))
                else:
                    os.makedirs(args.path)
            except OSError:
                # output directory already exists
                pass

        retry = True
        tries = 1
        while retry:
            retry = False
            try:
                #                 _
                #  _ __ ___   ___| |_ __ _
                # | '_ ` _ \ / _ \ __/ _` |
                # | | | | | |  __/ || (_| |
                # |_| |_| |_|\___|\__\__,_|
2015-09-17 18:23:18 +02:00
2015-09-19 11:43:16 +02:00
                meta_url = urlify(padid, ext=".json")
                raw_url = urlify(padid, ext=".txt")
                colors_url = urlify(padid, ext=".html")

                if ginfo:
                    meta_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], meta_url.encode("utf-8"))
                    raw_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], raw_url.encode("utf-8"))
                    colors_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], colors_url.encode("utf-8"))
                else:
                    meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
                    raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
                    colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))

                if not args.pretend:
                    meta = {}
                    meta['padid'] = padid
                    revisions_url = apiurl + 'getRevisionsCount?' + urlencode(data)
                    if args.showurls:
                        print(revisions_url, file=sys.stderr)
                    meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']

                    # CHECK REVISIONS (against existing meta)
                    if meta['total_revisions'] == 0:
                        if args.verbose:
                            print("pad has no revisions, skipping", file=sys.stderr)
                        continue
                    if os.path.exists(meta_out):
                        with open(meta_out) as f:
                            old_meta = json.load(f)
                        if not args.force and old_meta['total_revisions'] == meta['total_revisions']:
                            if args.verbose:
                                print("skipping (up to date)", file=sys.stderr)
                            continue

                    lastedited_url = apiurl + 'getLastEdited?' + urlencode(data)
                    if args.showurls:
                        print(lastedited_url, file=sys.stderr)
                    lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited']
                    meta['lastedited_raw'] = lastedited_raw
                    # lastEdited is a unix timestamp in milliseconds
                    meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw) / 1000).isoformat()

                    # author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names people type)
                    authors_url = apiurl + 'listAuthorsOfPad?' + urlencode(data)
                    if args.showurls:
                        print(authors_url, file=sys.stderr)
                    meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']

                    meta['colors'] = colors_url
                    meta['raw'] = raw_url
                    meta['meta'] = meta_url

                # defer output to LAST STEP (as confirmation)

                #  _ __ __ ___      __
                # | '__/ _` \ \ /\ / /
                # | | | (_| |\ V  V /
                # |_|  \__,_| \_/\_/

                text_url = apiurl + "getText?" + urlencode(data)
                if args.showurls:
                    print(text_url, file=sys.stderr)
                if not args.pretend:
                    rawText = json.load(urlopen(text_url))['data']['text']
                    if rawText.strip() == "":
                        if args.verbose:
                            print("empty text, skipping", file=sys.stderr)
                        continue
                    if not args.hidepaths:
                        print(raw_out, file=sys.stderr)
                    with open(raw_out, "w") as f:
                        f.write(rawText.encode("utf-8"))

                #  _     _             _
                # | |__ | |_ _ __ ___ | |
                # | '_ \| __| '_ ` _ \| |
                # | | | | |_| | | | | | |
                # |_| |_|\__|_| |_| |_|_|

                # todo? -- regular HTML output

                #            _
                #   ___ ___ | | ___  _ __ ___
                #  / __/ _ \| |/ _ \| '__/ __|
                # | (_| (_) | | (_) | |  \__ \
                #  \___\___/|_|\___/|_|  |___/
                if not args.hidepaths:
                    print(colors_out, file=sys.stderr)
                data['startRev'] = "0"
                colors_url = apiurl + 'createDiffHTML?' + urlencode(data)
                if args.showurls:
                    print(colors_url, file=sys.stderr)
                html = json.load(urlopen(colors_url))['data']['html']
                t = html5lib.parse(html, namespaceHTMLElements=False)
                trim_removed_spans(t)
                html = ET.tostring(t, method="html")
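                # createDiffHTML returns the pad body marked up with per-author
                # spans (styled by the <style> block extracted further down);
                # text deleted over the pad's history comes back wrapped in
                # <span class="removed">...</span>, which trim_removed_spans
                # just stripped.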

                # Stage 1: Process as text
                # process [[wikilink]] style links and (optionally) add
                # the linked page names to the spider's todo list
                if args.linkify:
                    html, links = linkify(html)
                    if args.spider:
                        for l in links:
                            if l not in todo and l not in done:
                                if l.startswith("http://") or l.startswith("https://"):
                                    if args.verbose:
                                        print("Ignoring absolute URL in [[ link ]] form", file=sys.stderr)
                                    continue
                                # if args.verbose:
                                #     print ("  link: {0}".format(l), file=sys.stderr)
                                todo.append(l)

                # Stage 2: Process as ElementTree
                t = html5lib.parse(html, namespaceHTMLElements=False)
                # apply padlinkpats: turn links to other pads into wikilinks
                for a in t.findall(".//a"):
                    href = a.attrib.get("href")
                    original_href = href
                    if href:
                        # if args.verbose:
                        #     print ("searching for PADLINK: {0}".format(href))
                        for pat in padlinkpats:
                            if re.search(pat, href) is not None:
                                # if args.verbose:
                                #     print ("  found PADLINK: {0}".format(href))
                                href = re.sub(pat, "\\1.html", href)
                                # don't clobber the outer padid: this is the linked pad
                                linked_padid = filename_to_padid(href)
                                set_text_contents(a, "[[{0}]]".format(linked_padid))
                                if linked_padid not in todo and linked_padid not in done:
                                    if args.verbose:
                                        print("  link: {0}".format(linked_padid), file=sys.stderr)
                                    todo.append(linked_padid)
                        # apply linkpats
                        for s, r in linkpats:
                            href = re.sub(s, r, href)

                        if href != original_href:
                            old_contents = text_contents(a)
                            # print ("OLD_CONTENTS {0}".format(old_contents))
                            if old_contents == original_href:
                                if args.verbose:
                                    print("  Updating href IN TEXT", file=sys.stderr)
                                set_text_contents(a, href)
                            if args.verbose:
                                print("  Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
                            a.attrib['href'] = href

                        # SHOWIMAGES: inject img tag for (local) images
                        if args.add_images:
                            ext = os.path.splitext(href)[1].lower().lstrip(".")
                            if ext in ("png", "gif", "jpeg", "jpg"):
                                # ap = _parent(a)
                                print("Adding img '{0}'".format(href), file=sys.stderr)
                                img = ET.SubElement(a, "img")
                                br = ET.SubElement(a, "br")
                                # SubElement appends, so move the img & br to the front of the link
                                a.remove(img); a.insert(0, img)
                                a.remove(br); a.insert(1, br)
                                img.attrib['src'] = href

                # extract the style tag (with authorship colors)
                style = t.find(".//style")
                if style is not None:
                    if args.authors_css:
                        # collect this pad's author styles into the shared rules
                        for i in style.text.splitlines():
                            if len(i):
                                selector, rule = i.split(' ', 1)
                                authors_css_rules[selector] = rule
                        # replace the individual style with a ref to the authors-css
                        style = '<link rel="stylesheet" type="text/css" href="{0}">'.format(args.authors_css)
                    else:
                        style = ET.tostring(style, method="html")
                else:
                    style = ""

                # and extract the contents of the body
                html = contents(t.find(".//body"))

                if not args.pretend:
                    with open(colors_out, "w") as f:
                        # f.write(html.encode("utf-8"))
                        f.write(colors_template.render(
                            html=html,
                            style=style,
                            revision=meta['total_revisions'],
                            padid=padid,
                            timestamp=datetime.now(),
                            meta_url=meta_url,
                            raw_url=raw_url,
                            colors_url=colors_url,
                            lastedited=meta['lastedited']
                        ).encode("utf-8"))

                    # OUTPUT METADATA (finally, as confirmation of a completed dump)
                    if not args.hidepaths:
                        print(meta_out, file=sys.stderr)
                    with open(meta_out, "w") as f:
                        json.dump(meta, f)

                #  _
                # | | ___   ___  _ __
                # | |/ _ \ / _ \| '_ \
                # | | (_) | (_) | |_) |
                # |_|\___/ \___/| .__/
                #               |_|
                count += 1
                if args.limit and count >= args.limit:
                    break

            # except HTTPError as e:
            #     retry = True
            # except TypeError as e:
            #     print ("TypeError, skipping!", file=sys.stderr)
            except Exception as e:
                print("[{0}] Exception: {1}".format(tries, e), file=sys.stderr)
                sleep(3)
                retry = True

            if retry:
                tries += 1
                if tries > 5:
                    print("GIVING UP", file=sys.stderr)
                    retry = False

    # Write the unified CSS with authors
    if args.authors_css:
        authors_css_path = os.path.join(args.path, args.authors_css)
        print(authors_css_path, file=sys.stderr)
        with open(authors_css_path, 'w') as css:
            for selector, rule in sorted(authors_css_rules.items()):
                css.write(selector + ' ' + rule + '\n')
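
# Example invocations, assuming a console script or wrapper passes
# sys.argv[1:] to main() (pad and file names are hypothetical):
#   etherdump mypad                                # dump a single pad
#   etherdump --allpads --authors-css authors.css  # dump everything, shared css
#   etherdump start --linkify --spider             # follow [[links]] from "start"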