Added comments.

This commit is contained in:
Thomas Gideon 2010-09-29 12:32:16 -04:00
parent ec0bc0569f
commit 5f7d718d68


@@ -1,4 +1,34 @@
#!/usr/bin/python
#
# append.py - Compares the podcast category feed for the podcast's web site
# with format specific feeds, adding the newest episode if missing.
#
# All project files are made available under the following, BSD-new license.
#
# Copyright (c) 2010, Thomas Gideon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Thomas Gideon nor the
# names of additional contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
import feedparser
import urllib2
@@ -13,6 +43,7 @@ import datetime
def __fetch_feed(url):
    """ Pull the feed and parse it, logging any errors. """
    try:
        return feedparser.parse(url)
    except HTTPError, e:
@@ -25,6 +56,11 @@ def __fetch_feed(url):
def __append(feed, suffix, append_fn):
    """
    For the given main site feed, load the appropriate media specific feed
    and compare. If the latest episode isn't in the media specific feed,
    insert it, making the necessary adjustments to the new episode's entry.
    """
    latest = __fetch_feed('cmdln_%s.xml' % suffix).entries[0]
    entry = feed.entries[0]
    if latest.title.find(entry.title) != -1:
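The comparison the docstring describes boils down to checking whether the newest main site entry already shows up in the media specific feed. A minimal sketch of that check, outside of append.py and with an illustrative helper name and feed locations:

import feedparser

def needs_append(site_feed_url, media_feed_path):
    # newest entry from the main site feed and from the media specific feed
    site_latest = feedparser.parse(site_feed_url).entries[0]
    media_latest = feedparser.parse(media_feed_path).entries[0]
    # same test as above: once the episode is present, the media feed's
    # newest title contains the main feed's newest title
    return media_latest.title.find(site_latest.title) == -1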
@@ -56,11 +92,12 @@ def __append(feed, suffix, append_fn):
def __append_non_itunes(entry, output, suffix, base_url):
    """
    For most of the feeds, new episodes are simple stanzas and the
    adjustments consist mostly of copying what is in the main site feed's
    entry and just re-writing the enclosure to the appropriate media file.
    """
    (url, mime_type, size) = __enclosure(entry.enclosures, base_url, suffix)
    output.write(""" <item>
<title>%(title)s (Comment Line 240-949-2638)</title>
<link>%(link)s</link>
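The stanza writing above is plain string interpolation against a dictionary of values copied from the main feed entry. The hunk cuts the template off, so the enclosure line and all of the values below are assumptions for illustration, not quotes from append.py:

template = """ <item>
<title>%(title)s (Comment Line 240-949-2638)</title>
<link>%(link)s</link>
<enclosure url="%(url)s" type="%(mime_type)s" length="%(size)s"/>
</item>
"""
values = {'title': 'Example Episode', 'link': 'http://thecommandline.net/example/',
          'url': 'http://www.archive.org/download/example/cmdln.net_2010-09-29.ogg',
          'mime_type': 'audio/ogg', 'size': 12345678}
print(template % values)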
@@ -81,6 +118,13 @@ def __append_non_itunes(entry, output, suffix, base_url):
def __append_itunes(entry, output, suffix, base_url):
    """
    For the iTunes/AAC feed, there are some additional elements that make
    use of the Apple extensions to RSS. Some of these, like the duration,
    author and subtitle, can be copied as is. The description and summary
    produced by PodPress are less than desirable, so those get munged into
    something more suitable before being written into the iTunes feed.
    """
    description = __description(entry.content)
    soup = BeautifulSoup(description)
    summary = '\n\n'.join([''.join(p.findAll(text=True)) for p in soup.findAll('p')])
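What that summary expression produces is easier to see with a throwaway description; this uses the same BeautifulSoup 3 calls as the hunk above, only the sample markup is made up:

from BeautifulSoup import BeautifulSoup

description = '<p>First paragraph with a <a href="#">link</a>.</p>\n<p>Second paragraph.</p>'
soup = BeautifulSoup(description)
summary = '\n\n'.join([''.join(p.findAll(text=True)) for p in soup.findAll('p')])
print(summary)
# First paragraph with a link.
#
# Second paragraph.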
@@ -115,6 +159,13 @@ def __append_itunes(entry, output, suffix, base_url):
def __permalink(title):
    """
    PodPress uses the opaque permalink from WordPress, basically just a
    base URL with a minimal query string containing the post's internal ID.
    The OS X app previously used to maintain these feeds, Feeder, munged
    the title into a nice, readable slug. This function reproduces what
    Feeder does to populate the permalink element in the feed entry.
    """
    permalink = title.lower()
    permalink = re.sub('-', '', permalink)
    permalink = re.sub('[^a-z0-9]', '-', permalink)
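The hunk stops before whatever turns the slug into the final permalink URL, but the three visible substitutions are easy to trace with a made-up title:

import re

title = 'Command and Control-Alt-Delete'
permalink = title.lower()                        # 'command and control-alt-delete'
permalink = re.sub('-', '', permalink)           # 'command and controlaltdelete'
permalink = re.sub('[^a-z0-9]', '-', permalink)  # 'command-and-controlaltdelete'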
@@ -125,6 +176,15 @@ def __permalink(title):
def __description(content):
    """
    This function strips out parts of the description used in the main site
    feed that are less appropriate for the media specific feeds. PodPress
    leaves a blank paragraph where its Flash player renders. The main
    site's episodes have some extra verbiage after the license image and
    links, namely the sharing and related posts plugin output. A simple,
    bare link is added to the last paragraph for the benefit of aggregators
    that may strip out HTML.
    """
    description = content[0].value
    description = re.sub('<p></p>\n', '', description)
    description = re.sub(re.compile('License</a>.</p>.*$', re.M | re.S), 'License</a>.</p>', description)
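A short, made-up description shows what the two substitutions above remove: the empty paragraph PodPress leaves for its player and everything following the license link:

import re

description = ('<p></p>\n'
               '<p>Episode notes here.</p>\n'
               '<p><a href="#">Creative Commons License</a>.</p>\n'
               '<p>Sharing buttons and related posts markup</p>')
description = re.sub('<p></p>\n', '', description)
description = re.sub(re.compile('License</a>.</p>.*$', re.M | re.S),
                     'License</a>.</p>', description)
print(description)
# <p>Episode notes here.</p>
# <p><a href="#">Creative Commons License</a>.</p>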
@@ -133,10 +193,19 @@ def __description(content):
def __enclosure(enclosures, base_url, suffix):
    """
    Uses the file name from the main site's enclosure plus the base_url to
    pull together the values needed to re-write the enclosure attributes for the correct media file.
    """
    m = re.search('cmdln.net_[0-9]{4}-[0-9]{2}-[0-9]{2}', enclosures[0].href)
    url = '%s/%s.%s' % (base_url, m.group(), suffix)
    usock = urllib2.urlopen(url)
    # Google Listen won't play 'application/ogg' and that MIME type is currently
    # returned by archive.org for Ogg Vorbis files
    if 'ogg' == suffix:
        mime_type = 'audio/ogg'
    else:
        mime_type = usock.info().type
    size = usock.info().get('Content-Length')
    if size is None:
        size = 0
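Apart from the urlopen call, which only supplies the MIME type and Content-Length, the re-writing is just the regular expression and string formatting visible above. With illustrative values (neither URL is taken from the real feeds):

import re

base_url = 'http://www.archive.org/download/some-item'                # illustrative
href = 'http://thecommandline.net/audio/cmdln.net_2010-09-29.mp3'     # illustrative
suffix = 'ogg'
m = re.search('cmdln.net_[0-9]{4}-[0-9]{2}-[0-9]{2}', href)
url = '%s/%s.%s' % (base_url, m.group(), suffix)
# http://www.archive.org/download/some-item/cmdln.net_2010-09-29.ogg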
@@ -144,6 +213,11 @@ def __enclosure(enclosures, base_url, suffix):
def __archive_slug(title):
    """
    The Internet Archive transforms the title field of new entries in a
    specific way. This is my reverse engineering of their algorithm, based
    on their description and empirical data from dozens of uploads.
    """
    slug = re.sub(r'([^0-9])-([^0-9])', r'\1\2', title)
    slug = re.sub(r'[^A-Za-z0-9\-]', ' ', slug)
    slug = re.sub(' {2,}', ' ', slug)
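Later steps in __archive_slug fall outside this hunk, but the three shown substitutions behave like this on an invented title:

import re

title = 'Command-Line Update: 2010-09-29'
slug = re.sub(r'([^0-9])-([^0-9])', r'\1\2', title)  # 'CommandLine Update: 2010-09-29'
slug = re.sub(r'[^A-Za-z0-9\-]', ' ', slug)          # 'CommandLine Update  2010-09-29'
slug = re.sub(' {2,}', ' ', slug)                    # 'CommandLine Update 2010-09-29'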
@@ -153,9 +227,11 @@ def __archive_slug(title):
    return slug

def __main():
    logging.basicConfig(level=logging.INFO,
        format='%(message)s')
    # pulls the category feed from the web site, which will have just the most recent
    # episodes along with all the iTunes jiggery-pokery PodPress performs
    feed = __fetch_feed('http://thecommandline.net/category/podcast/feed/')
    if feed is None:
        logging.error('Failed to fetch feed.')
@@ -164,8 +240,9 @@ def main():
    __append(feed, 'mp3', __append_non_itunes)
    __append(feed, 'ogg', __append_non_itunes)
    __append(feed, 'm4a', __append_itunes)
    # TODO add flac

if __name__ == "__main__":
    __main()
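The flac TODO would presumably just follow the pattern of the other non-iTunes feeds, along the lines of:

    __append(feed, 'flac', __append_non_itunes)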