# manymanymany-varia-websites/plugins/pelican_comment_system/import/blogger_comment_export.py

#! python3.6
"""
Export Comments from Blogger XML

Takes in a Blogger export XML file and spits out each comment in a separate
file, so that the comments can be used with the [Pelican Comment System]
(https://bernhard.scheirle.de/posts/2014/March/29/static-comments-via-email/).

May be simple to extend to export posts as well.

For a more detailed description, read my blog post at
    http://blog.minchin.ca/2016/12/blogger-comments-exported.html

Author: Wm. Minchin -- minchinweb@gmail.com
License: MIT
Changes:

 - 2016.12.29 -- initial release
 - 2017.01.10 -- clean-up for addition in Pelican Comment System repo
"""

from pathlib import Path

import untangle

###############################################################################
# Constants                                                                   #
###############################################################################

# Path of the Blogger export XML file that main() hands to untangle.parse().
BLOGGER_EXPORT = r'c:\tmp\blog.xml'
# Output directory (resolved against the current working directory); it
# receives one sub-folder per article slug plus the authors file.
COMMENTS_DIR = 'comments'
# Extension appended to the short comment id to form each comment's filename.
COMMENT_EXT = '.md'
# Name of the author listing that export_authors() writes in COMMENTS_DIR.
AUTHORS_FILENAME = 'authors.txt'

###############################################################################
# Main Code Body                                                              #
###############################################################################

# Accumulates one (author name, author picture `src`) tuple per processed
# comment; filled by process_comment() and de-duplicated by export_authors().
authors_and_pics = []


def main():
    """Parse the Blogger export and write out every comment it contains.

    Each feed entry is classified by the part of its category term that
    follows the ``#`` (``settings``, ``post``, ``comment`` or ``template``);
    anything else is counted as "other".  Comment entries are handed to
    ``process_comment()``.  Once all entries are processed the author list
    is exported and a summary of the per-type counts is printed.
    """
    obj = untangle.parse(BLOGGER_EXPORT)

    templates = 0
    posts = 0
    comments = 0
    settings = 0
    others = 0

    for entry in obj.feed.entry:
        try:
            full_type = entry.category['term']
        except TypeError:
            # if a post is under multiple categories, look for the first
            # category whose term carries the `#<type>` marker
            for my_category in entry.category:
                full_type = my_category['term']
                if '#' in full_type:
                    break
            else:
                # No category identifies the entry type: count it once and
                # move on.  Without the `continue` the entry would fall
                # through to the chain below and be counted a second time.
                others += 1
                continue

        # keep only what follows the '#'; when there is no '#', find()
        # returns -1 and the whole term is kept
        simple_type = full_type[full_type.find('#')+1:]

        if simple_type == 'settings':
            settings += 1
        elif simple_type == 'post':
            posts += 1
            # process posts here
        elif simple_type == 'comment':
            comments += 1
            process_comment(entry, obj)
        elif simple_type == 'template':
            templates += 1
        else:
            others += 1

    export_authors()

    print('''
            {} template
            {} posts (including drafts)
            {} comments
            {} settings
            {} other entries'''.format(templates,
                                       posts,
                                       comments,
                                       settings,
                                       others))


def process_comment(entry, obj):
    """Write a single Blogger comment to its own file.

    The comment is stored as
    ``COMMENTS_DIR/<article slug>/<short comment id><COMMENT_EXT>`` with a
    small ``date`` / ``author`` / ``email`` header followed by the comment
    body.  The author and picture reference are also recorded in the
    module-level ``authors_and_pics`` list.

    Args:
        entry: the parsed feed entry of the comment.
        obj: the whole parsed export; searched for the article the comment
            replies to.

    Returns:
        None.  If no entry in ``obj`` matches the comment's reply-to
        reference, a message is printed and no file is written.
    """
    # e.g. "tag:blogger.com,1999:blog-26967745.post-4115122471434984978"
    comment_id = entry.id.cdata
    # in ISO 8601 format, usable as is
    comment_published = entry.published.cdata
    comment_body = entry.content.cdata
    comment_post_id = entry.thr_in_reply_to['ref']
    comment_author = entry.author.name.cdata
    comment_author_pic = entry.author.gd_image['src']
    comment_author_email = entry.author.email.cdata

    # add author and pic to the module-level list (mutating it in place
    # needs no `global` declaration)
    authors_and_pics.append((comment_author, comment_author_pic))

    # use this for a filename for the comment
    # e.g. "4115122471434984978"
    comment_short_id = comment_id[comment_id.find('post-')+5:]

    comment_text = "date: {}\nauthor: {}\nemail: {}\n\n{}\n".format(
        comment_published,
        comment_author,
        comment_author_email,
        comment_body)

    # article: find the entry this comment replies to (a distinct loop
    # variable is used so the `entry` parameter is not shadowed)
    for candidate in obj.feed.entry:
        if candidate.id.cdata == comment_post_id:
            article_entry = candidate
            break
    else:
        print("No matching article for comment", comment_id, comment_post_id)
        # don't process comment further
        return

    # article slug
    for link in article_entry.link:
        if link['rel'] == 'alternate':
            article_link = link['href']
            break
    else:
        article_title = article_entry.title.cdata
        print('Could not find slug for', article_title)
        article_link = article_title.lower().replace(' ', '-')

    # Keep what follows the last '/', then drop a '.html' suffix only when
    # one is present.  Slicing straight to `find('.html')` would use -1 as
    # the end index when it is missing and cut off the last character.
    article_slug = article_link[article_link.rfind('/')+1:]
    html_at = article_slug.find('.html')
    if html_at != -1:
        article_slug = article_slug[:html_at]

    # folder; if it doesn't exist, create it
    comment_dir = Path(COMMENTS_DIR).resolve() / article_slug
    comment_dir.mkdir(parents=True, exist_ok=True)
    # write the comment file; explicit UTF-8 so non-ASCII comment text does
    # not depend on the platform's default encoding
    comment_filename = comment_dir / (comment_short_id + COMMENT_EXT)
    comment_filename.write_text(comment_text, encoding='utf-8')


def export_authors():
    """Write the sorted, de-duplicated author list to the authors file.

    Every distinct ``(author, picture)`` pair collected in
    ``authors_and_pics`` becomes one line of the form
    ``<author>\\t\\t<picture>``.  The file is written to
    ``COMMENTS_DIR/AUTHORS_FILENAME``.
    """
    unique_authors = sorted(set(authors_and_pics))
    str_export = ''.join(
        author + '\t\t' + pic + '\n' for author, pic in unique_authors
    )

    comments_dir = Path(COMMENTS_DIR).resolve()
    # The directory only exists already if at least one comment file was
    # written; create it so an export without comments does not fail.
    comments_dir.mkdir(parents=True, exist_ok=True)
    authors_filename = comments_dir / AUTHORS_FILENAME
    # explicit UTF-8 so non-ASCII author names do not depend on the
    # platform's default encoding
    authors_filename.write_text(str_export, encoding='utf-8')


# Run the export only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
adding the 2 submodules again 7 years ago			`#! python3.6`
			`"""`
			`Export Comments from BLogger XML`

			`Takes in a Blogger export XML file and spits out each comment in a seperate`
			`file, such that can be used with the [Pelican Comment System]`
			`(https://bernhard.scheirle.de/posts/2014/March/29/static-comments-via-email/).`

			`May be simple to extend to export posts as well.`

			`For a more detailed desciption, read my blog post at`
			`http://blog.minchin.ca/2016/12/blogger-comments-exported.html`

			`Author: Wm. Minchin -- minchinweb@gmail.com`
			`License: MIT`
			`Changes:`

			`- 2016.12.29 -- initial release`
			`- 2017.01.10 -- clean-up for addition in Pelican Comment System repo`
			`"""`

			`from pathlib import Path`

			`import untangle`

			`###############################################################################`
			`# Constants #`
			`###############################################################################`

			`BLOGGER_EXPORT = r'c:\tmp\blog.xml'`
			`COMMENTS_DIR = 'comments'`
			`COMMENT_EXT = '.md'`
			`AUTHORS_FILENAME = 'authors.txt'`

			`###############################################################################`
			`# Main Code Body #`
			`###############################################################################`

			`authors_and_pics = []`


			`def main():`
			`obj = untangle.parse(BLOGGER_EXPORT)`

			`templates = 0`
			`posts = 0`
			`comments = 0`
			`settings = 0`
			`others = 0`

			`for entry in obj.feed.entry:`
			`try:`
			`full_type = entry.category['term']`
			`except TypeError:`
			`# if a post is under multiple categories`
			`for my_category in entry.category:`
			`full_type = my_category['term']`
			# str.find() uses a return of `-1` to denote failure
			`if full_type.find('#') != -1:`
			`break`
			`else:`
			`others += 1`

			`simple_type = full_type[full_type.find('#')+1:]`

			`if 'settings' == simple_type:`
			`settings += 1`
			`elif 'post' == simple_type:`
			`posts += 1`
			`# process posts here`
			`elif 'comment' == simple_type:`
			`comments += 1`
			`process_comment(entry, obj)`
			`elif 'template' == simple_type:`
			`templates += 1`
			`else:`
			`others += 1`

			`export_authors()`

			`print('''`
			`{} template`
			`{} posts (including drafts)`
			`{} comments`
			`{} settings`
			`{} other entries'''.format(templates,`
			`posts,`
			`comments,`
			`settings,`
			`others))`


			`def process_comment(entry, obj):`
			`# e.g. "tag:blogger.com,1999:blog-26967745.post-4115122471434984978"`
			`comment_id = entry.id.cdata`
			`# in ISO 8601 format, usable as is`
			`comment_published = entry.published.cdata`
			`comment_body = entry.content.cdata`
			`comment_post_id = entry.thr_in_reply_to['ref']`
			`comment_author = entry.author.name.cdata`
			`comment_author_pic = entry.author.gd_image['src']`
			`comment_author_email = entry.author.email.cdata`

			`# add author and pic to global list`
			`global authors_and_pics`
			`authors_and_pics.append((comment_author, comment_author_pic))`

			`# use this for a filename for the comment`
			`# e.g. "4115122471434984978"`
			`comment_short_id = comment_id[comment_id.find('post-')+5:]`

			`comment_text = "date: {}\nauthor: {}\nemail: {}\n\n{}\n"\`
			`.format(comment_published,`
			`comment_author,`
			`comment_author_email,`
			`comment_body)`

			`# article`
			`for entry in obj.feed.entry:`
			`entry_id = entry.id.cdata`
			`if entry_id == comment_post_id:`
			`article_entry = entry`
			`break`
			`else:`
			`print("No matching article for comment", comment_id, comment_post_id)`
			`# don't process comment further`
			`return`

			`# article slug`
			`for link in article_entry.link:`
			`if link['rel'] == 'alternate':`
			`article_link = link['href']`
			`break`
			`else:`
			`article_title = article_entry.title.cdata`
			`print('Could not find slug for', article_title)`
			`article_link = article_title.lower().replace(' ', '-')`

			`article_slug = article_link[article_link.rfind('/')+1:`
			`article_link.find('.html')]`

			`comment_filename = Path(COMMENTS_DIR).resolve()`
			`# folder; if it doesn't exist, create it`
			`comment_filename = comment_filename / article_slug`
			`comment_filename.mkdir(parents=True, exist_ok=True)`
			`# write the comment file`
			`comment_filename = comment_filename / (comment_short_id + COMMENT_EXT)`
			`comment_filename.write_text(comment_text)`


			`def export_authors():`
			`to_export = set(authors_and_pics)`
			`to_export = list(to_export)`
			`to_export.sort()`

			`str_export = ''`
			`for i in to_export:`
			`str_export += (i[0] + '\t\t' + i[1] + '\n')`

			`authors_filename = Path(COMMENTS_DIR).resolve() / AUTHORS_FILENAME`
			`authors_filename.write_text(str_export)`


			`if __name__ == "__main__":`
			`main()`