From 5f7d718d688f89c0b66eea5f34f94fa78ab737d4 Mon Sep 17 00:00:00 2001 From: Thomas Gideon Date: Wed, 29 Sep 2010 12:32:16 -0400 Subject: [PATCH] Added comments. --- append.py | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 84 insertions(+), 7 deletions(-) diff --git a/append.py b/append.py index 3c0a09c..7895203 100755 --- a/append.py +++ b/append.py @@ -1,4 +1,34 @@ #!/usr/bin/python +# +# append.py - Compares the podcast category feed for the podcast's web site +# with format specific feeds, adding the newest episode if missing. +# +# All project files are made available under the following, BSD-new license. +# +# Copyright (c) 2010, Thomas Gideon +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Thomas Gideon nor the +# names of additional contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys import feedparser import urllib2 @@ -13,6 +43,7 @@ import datetime def __fetch_feed(url): + """ Pull the feed and parse it, logging any errors. """ try: return feedparser.parse(url) except HTTPError, e: @@ -25,6 +56,11 @@ def __fetch_feed(url): def __append(feed, suffix, append_fn): + """ + For the given main site feed, load the appropriate media specific feed + and compare. If the latest episode isn't in the media specific feed, + insert it making the necessary adjustments to the new episode's entry. + """ latest = __fetch_feed('cmdln_%s.xml' % suffix).entries[0] entry = feed.entries[0] if latest.title.find(entry.title) != -1: @@ -56,11 +92,12 @@ def __append(feed, suffix, append_fn): def __append_non_itunes(entry, output, suffix, base_url): + """ + For most of the feeds, new episodes are simple stanzas and the + adjustments consist mostly of copying what is in the main site feed's + entry and just re-writing the enclosure to the appropriate media file. 
+ """ (url, mime_type, size) = __enclosure(entry.enclosures, base_url, suffix) - # Google listen won't play 'application/ogg' and that mime type is currently - # returned by archive.org for Ogg Vorbis files - if 'ogg' == suffix: - mime_type = 'audio/ogg' output.write(""" %(title)s (Comment Line 240-949-2638) %(link)s @@ -81,6 +118,13 @@ def __append_non_itunes(entry, output, suffix, base_url): def __append_itunes(entry, output, suffix, base_url): + """ + For the iTunes/AAC feed, there are some additional elements that make + use of the Apple extensions to RSS. Some of these, like the duration, + author and subtitle, can be copied as is. The description and summary + produced by PodPress are less than desirable so those get munged to + something more suitable before writing into the iTunes feed. + """ description = __description(entry.content) soup = BeautifulSoup(description) summary = '\n\n'.join([''.join(p.findAll(text=True)) for p in soup.findAll('p')]) @@ -115,6 +159,13 @@ def __append_itunes(entry, output, suffix, base_url): def __permalink(title): + """ + PodPress uses the opaque permalink from WordPress, basically just a + base url with a minimal query string containing the post's internal ID. + The OS X app used to maintain these feeds previously, Feeder, munged + the title into a nice, readable slug. This function reproduces what + Feeder does to populate the permalink element in the feed entry. + """ permalink = title.lower() permalink = re.sub('-', '', permalink) permalink = re.sub('[^a-z0-9]', '-', permalink) @@ -125,6 +176,15 @@ def __permalink(title): def __description(content): + """ + This function strips out parts of the description used in the main site + feed that are less appropriate for the media specific feeds. PodPress + leaves a blank paragraph where its Flash player renders. The main + site's episodes have some extra verbiage after the license image and + links, namely the sharing and related posts plugin output. 
A simple, + bare link is added to the last paragraph for the benefit of aggregators + that may strip out HTML. + """ description = content[0].value description = re.sub('

\n', '', description) description = re.sub(re.compile('License.

.*$', re.M | re.S), 'License.

', description) @@ -133,10 +193,19 @@ def __description(content): def __enclosure(enclosures, base_url, suffix): + """ + Uses the file name from the main site's enclosure plus the base_url to + pull together values to re-write the attributes for the correct media. + """ m = re.search('cmdln.net_[0-9]{4}-[0-9]{2}-[0-9]{2}', enclosures[0].href) url = '%s/%s.%s' % (base_url, m.group(), suffix) usock = urllib2.urlopen(url) - mime_type = usock.info().type + # Google listen won't play 'application/ogg' and that mime type is currently + # returned by archive.org for Ogg Vorbis files + if 'ogg' == suffix: + mime_type = 'audio/ogg' + else: + mime_type = usock.info().type size = usock.info().get('Content-Length') if size is None: size = 0 @@ -144,6 +213,11 @@ def __enclosure(enclosures, base_url, suffix): def __archive_slug(title): + """ + The Internet Archive transforms the title field of new entries in a + specific way. This is my reverse engineering of their algorithm based + on their description and empirical data from dozens of uploads. + """ slug = re.sub('\([^0-9]\)-\([^0-9]\)', '\1\2', title) slug = re.sub('[^A-Za-z0-9\-]', ' ', slug) slug = re.sub(' {2,}', ' ', slug) @@ -153,9 +227,11 @@ def __archive_slug(title): return slug -def main(): +def __main(): logging.basicConfig(level=logging.INFO, format='%(message)s') + # pulls the category feed from the web site which will have just the most recent episodes + # along with all the iTunes jiggery-pokery PodPress performs feed = __fetch_feed('http://thecommandline.net/category/podcast/feed/') if feed is None: logging.error('Failed to fetch feed.') @@ -164,8 +240,9 @@ def main(): __append(feed, 'mp3', __append_non_itunes) __append(feed, 'ogg', __append_non_itunes) __append(feed, 'm4a', __append_itunes) + # TODO add flac if __name__ == "__main__": - main() + __main()