#!/usr/bin/python
#
# append.py - Compares the podcast category feed for the podcast's web site
# with format specific feeds, adding the newest episode if missing.
#
# All project files are made available under the following, BSD-new license.
#
# Copyright (c) 2010, Thomas Gideon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Thomas Gideon nor the
# names of additional contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
import feedparser
import urllib2
from urllib2 import HTTPError, URLError
import logging
import re
from BeautifulSoup import BeautifulSoup
import shutil
import time
import datetime


def __fetch_feed(url):
    """ Pull the feed and parse it, logging any errors. """
    try:
        return feedparser.parse(url)
    except HTTPError, e:
        logging.error('Failed with HTTP status code %d' % e.code)
        return None
    except URLError, e:
        logging.error('Failed to connect with network.')
        logging.debug('Network failure reason, %s.' % e.reason)
        return None
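
# feedparser.parse accepts either a URL or a local file path, so this helper
# serves both the remote category feed and the local cmdln_*.xml format
# feeds.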


def __append(feed, suffix, append_fn):
    """
    For the given main site feed, load the appropriate media specific feed
    and compare. If the latest episode isn't in the media specific feed,
    insert it, making the necessary adjustments to the new episode's entry.
    """
    media_feed = __fetch_feed('cmdln_%s.xml' % suffix)
    # guard against a missing or unreadable format feed rather than letting
    # the attribute access below blow up
    if media_feed is None or not media_feed.entries:
        logging.error('Could not read the %s feed.' % suffix)
        return
    latest = media_feed.entries[0]
    entry = feed.entries[0]
    if latest.title.find(entry.title) != -1:
        logging.info('%s is up to date.' % suffix)
        return
    base_url = 'http://www.archive.org/download/%s' % __archive_slug(entry.title)
    filename = 'cmdln_%s.xml' % suffix
    today = datetime.date.today()
    backup = '%s.%s' % (filename, today.strftime('%Y-%m-%d'))
    shutil.copy(filename, backup)
    f = open(backup)
    o = open(filename, 'w')
    first_item = False
    try:
        # strftime needs a time tuple, so use feedparser's parsed form of the
        # feed's update stamp
        updated = time.strftime('%a, %d %b %Y %X +0000', feed.feed.updated_parsed)
        for line in f:
            # insert the new entry just before the first existing item
            if line.find('<item>') != -1 and not first_item:
                append_fn(entry, o, suffix, base_url)
                first_item = True
            if line.startswith(' <pubDate>'):
                line = ' <pubDate>%s</pubDate>\n' % updated
            if line.startswith(' <lastBuildDate>'):
                line = ' <lastBuildDate>%s</lastBuildDate>\n' % updated
            o.write(line)
    finally:
        f.close()
        o.close()
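
# Both writers below share the append_fn signature used by __append; __main
# wires __append_non_itunes to the MP3 and Ogg feeds and __append_itunes to
# the AAC feed.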


def __append_non_itunes(entry, output, suffix, base_url):
    """
    For most of the feeds, new episodes are simple stanzas and the
    adjustments consist mostly of copying what is in the main site feed's
    entry and just re-writing the enclosure to point at the appropriate
    media file.
    """
    (url, mime_type, size) = __enclosure(entry.enclosures, base_url, suffix)
    output.write(""" <item>
   <title>%(title)s (Comment Line 240-949-2638)</title>
   <link>%(link)s</link>
   <description><![CDATA[%(description)s]]></description>
   <pubDate>%(pubDate)s</pubDate>
   <enclosure url="%(url)s" length="%(size)s" type="%(mime_type)s"/>
   <guid isPermaLink="false">%(permalink)s</guid>
 </item>
""" % {'title': entry.title,
       'link': entry.link,
       'description': __description(entry.content),
       'pubDate': entry.date,
       'permalink': __permalink(entry.title),
       'url': url,
       'mime_type': mime_type,
       'size': size})
    logging.info('Inserted new %s item.' % suffix)


def __append_itunes(entry, output, suffix, base_url):
    """
    For the iTunes/AAC feed, there are some additional elements that make
    use of the Apple extensions to RSS. Some of these, like the duration,
    author and subtitle, can be copied as is. The description and summary
    produced by PodPress are less than desirable, so those get munged into
    something more suitable before writing into the iTunes feed.
    """
    description = __description(entry.content)
    soup = BeautifulSoup(description)
    summary = '\n\n'.join([''.join(p.findAll(text=True)) for p in soup.findAll('p')])
    (url, mime_type, size) = __enclosure(entry.enclosures, base_url, suffix)
    if size == 0:
        raise Exception('Could not find media, %s.' % base_url)
    output.write(""" <item>
   <title>%(title)s (Comment Line 240-949-2638)</title>
   <link>%(link)s</link>
   <description><![CDATA[%(description)s]]></description>
   <pubDate>%(pubDate)s</pubDate>
   <enclosure url="%(url)s" length="%(size)s" type="%(mime_type)s"/>
   <guid isPermaLink="false">%(permalink)s</guid>
   <itunes:author>Thomas Gideon</itunes:author>
   <itunes:subtitle>%(subtitle)s</itunes:subtitle>
   <itunes:summary>%(summary)s</itunes:summary>
   <itunes:explicit>no</itunes:explicit>
   <itunes:duration>%(duration)s</itunes:duration>
 </item>
""" % {'title': entry.title,
       'link': entry.link,
       'description': description,
       'pubDate': entry.date,
       'permalink': __permalink(entry.title),
       'url': url,
       'mime_type': mime_type,
       'size': size,
       'subtitle': ''.join(soup.contents[0].findAll(text=True)),
       'summary': summary,
       'duration': entry.itunes_duration})
    logging.info('Inserted new %s item.' % suffix)
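
# A hypothetical illustration of the subtitle/summary munging above; actual
# PodPress markup is more involved:
#   description = '<p>Intro.</p>\n<p>Notes.</p>'
#   subtitle -> 'Intro.', summary -> 'Intro.\n\nNotes.'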


def __permalink(title):
    """
    PodPress uses the opaque permalink from WordPress, basically just a
    base url with a minimal query string containing the post's internal ID.
    The OS X app used to maintain these feeds previously, Feeder, munged
    the title into a nice, readable slug. This function reproduces what
    Feeder does to populate the permalink element in the feed entry.
    """
    permalink = title.lower()
    permalink = re.sub('-', '', permalink)
    permalink = re.sub(r'[^a-z0-9]', '-', permalink)
    permalink = re.sub('-{2,}', '-', permalink)
    if len(permalink) > 48:
        permalink = permalink[:48]
    return permalink
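
# A hypothetical example of the transform, not an actual post title:
#   __permalink('Security Alert! 2010-09-29')
#   -> 'security-alert-20100929'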


def __description(content):
    """
    This function strips out parts of the description used in the main site
    feed that are less appropriate for the media specific feeds. PodPress
    leaves a blank paragraph where its Flash player renders. The main
    site's episodes have some extra verbiage after the license image and
    links, namely the sharing and related posts plugin output. A simple,
    bare link is added to the last paragraph for the benefit of aggregators
    that may strip out HTML.
    """
    description = content[0].value
    # drop the empty paragraph PodPress leaves for its Flash player
    description = re.sub('<p></p>\n', '', description)
    # chop everything after the license link, where the plugin output starts
    description = re.sub(re.compile('License</a>.</p>.*$', re.M | re.S), 'License</a>.</p>', description)
    description = re.sub('</p>\n', '</p>\n\n', description)
    return re.sub('<p>View the <a', '<p>More news, commentary, and alternate feeds available at http://thecommandline.net/. View the <a', description)


def __enclosure(enclosures, base_url, suffix):
    """
    Uses the file name from the main site's enclosure plus the base_url to
    pull together values to re-write the attributes for the correct media.
    """
    m = re.search(r'cmdln\.net_[0-9]{4}-[0-9]{2}-[0-9]{2}', enclosures[0].href)
    url = '%s/%s.%s' % (base_url, m.group(), suffix)
    usock = urllib2.urlopen(url)
    try:
        # Google Listen won't play 'application/ogg' and that mime type is
        # currently returned by archive.org for Ogg Vorbis files
        if 'ogg' == suffix:
            mime_type = 'audio/ogg'
        else:
            mime_type = usock.info().type
        size = usock.info().get('Content-Length')
    finally:
        usock.close()
    if size is None:
        size = 0
    return (url, mime_type, size)
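
# A hypothetical example of the rewritten enclosure URL, assuming an episode
# posted 2010-09-29 (the slug here is illustrative, not a real archive.org
# item):
#   http://www.archive.org/download/<slug>/cmdln.net_2010-09-29.ogg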


def __archive_slug(title):
    """
    The Internet Archive transforms the title field of new entries in a
    specific way. This is my reverse engineering of their algorithm based
    on their description and empirical data from dozens of uploads.
    """
    # drop hyphens between non-digits, keeping ones inside dates; raw strings
    # keep the group references from being read as octal escapes
    slug = re.sub(r'([^0-9])-([^0-9])', r'\1\2', title)
    slug = re.sub(r'[^A-Za-z0-9\-]', ' ', slug)
    slug = re.sub(' {2,}', ' ', slug)
    tokens = slug.split(' ')
    tokens = [t.capitalize() for t in tokens]
    slug = ''.join(tokens)
    return slug
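
# A hypothetical example of the slug transform, not an actual upload:
#   __archive_slug('The Command Line 2010-09-29')
#   -> 'TheCommandLine2010-09-29'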


def __main():
    logging.basicConfig(level=logging.INFO,
                        format='%(message)s')
    # pulls the category feed from the web site, which will have just the
    # most recent episodes along with all the iTunes jiggery-pokery PodPress
    # performs
    feed = __fetch_feed('http://thecommandline.net/category/podcast/feed/')
    if feed is None:
        logging.error('Failed to fetch feed.')
        sys.exit(1)
    __append(feed, 'mp3', __append_non_itunes)
    __append(feed, 'ogg', __append_non_itunes)
    __append(feed, 'm4a', __append_itunes)
    # TODO add flac


if __name__ == "__main__":
    __main()
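
# The cmdln_*.xml files are opened with relative paths, so the script has to
# run from the directory holding the feed files; a hypothetical crontab entry
# (the path is an assumption, not from this repo):
#   0 * * * * cd /path/to/feeds && ./append.py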