autocast/relink.py

#!/usr/bin/python
#
# relink.py - A script intended for one time use to tweak a feed to re-link its
# enclosures to appropriate URLs at the Internet Archive.
#
# Copyright (c) 2010, Thomas Gideon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of Thomas Gideon nor the
#       names of additional contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from BeautifulSoup import BeautifulStoneSoup
import re
import urllib2
from urllib2 import HTTPError, URLError
import os.path
import logging
import shutil
import datetime


# TODO this could be re-worked to help re-write one of the existing feeds to produce an initial flac feed
def __repoint():
    """ Iterates through the feed, re-writing the enclosures.

    Copies cmdln_m4a.xml to a backup suffixed with today's date, parses the
    backup and, for every enclosure whose URL does not already mention
    archive.org, builds a download URL from the slug of the preceding
    sibling <title> and the base name of the current URL.  The enclosure is
    updated only if that URL can be opened; the first URL that cannot be
    opened stops the run.  Whatever was updated up to that point is then
    written back over the feed, so a later run can pick up where this one
    stopped.
    """
    logging.basicConfig(level=logging.INFO,
            format='%(message)s')
    filename = 'cmdln_m4a.xml'
    backup = '%s.%s' % (filename,
        datetime.date.today().strftime('%Y-%m-%d'))
    shutil.copy(filename, backup)

    # Parse from the backup and release the handle straight away.
    with open(backup) as f:
        soup = BeautifulStoneSoup(f)

    for enclosure in soup.findAll('enclosure'):
        url = enclosure['url']
        if 'archive.org' in url:
            # Already re-linked, e.g. by an earlier, interrupted run.
            continue
        title = enclosure.findPreviousSibling('title')
        if title is None or title.string is None:
            # Without a title there is no way to derive the archive slug.
            logging.error('No title found for enclosure, %s.', url)
            continue
        rewritten = 'http://www.archive.org/download/%s/%s' % (
            __archive_slug(title.string), os.path.basename(url))
        (mime_type, length) = __url_info(rewritten, enclosure['type'],
            enclosure['length'])
        if mime_type is None:
            print('Could not find media, %s.' % rewritten)
            break
        enclosure['url'] = rewritten
        enclosure['type'] = mime_type
        if length is not None:
            # Keep the existing length when the server does not report one.
            enclosure['length'] = length

    # The feed is only opened for writing (and thereby truncated) once all
    # the work above has succeeded, so a failure cannot leave it empty.
    with open(filename, 'w') as o:
        o.write(str(soup))


def __archive_slug(title):
    """ Derive the CamelCase slug used in archive.org URLs from a title.

    Only the text before the first '(' is used, which drops a trailing
    parenthetical comment; a title without a parenthesis is used whole.
    Every character other than an ASCII letter, a digit or '-' separates
    words.  Each word is capitalized (first character upper-cased, the rest
    lower-cased) and the words are joined without a separator.

    title -- the title text; must be a string, not None.

    Returns the slug, which is empty when no usable characters remain.
    """
    # partition() hands back the whole title when there is no '(', unlike
    # slicing on the -1 that find() returns, which cut off trailing text.
    head = title.partition('(')[0]
    # NOTE: a substitution with the pattern '\([^0-9]\)-\([^0-9]\)' used to
    # run here.  It could only match literal parentheses, which the text
    # before the first '(' never contains, so it was dropped; hyphens are
    # kept in the slug exactly as before.
    words = re.sub(r'[^A-Za-z0-9\-]', ' ', head).split()
    return ''.join([word.capitalize() for word in words])

def __url_info(url, mime_type, length):
    """ Open url and report the media type and length from its headers.

    url -- the address to open.
    mime_type, length -- accepted for the benefit of the caller but not
        consulted; the result comes from the response headers alone.

    Returns a (mime_type, length) tuple.  length is the value of the
    Content-Length header, or None if the response carries none.  Returns
    (None, None) if the response has no headers, or if the request fails
    with an HTTP or URL error, which is logged.
    """
    try:
        usock = urllib2.urlopen(url)
        try:
            headers = usock.info()
            if headers is None:
                return (None, None)
            return (headers.type, headers.get('Content-Length'))
        finally:
            # Only the headers are needed; close the connection rather than
            # leaving one open for every enclosure checked.
            usock.close()
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be caught first.
        logging.error('Failed with HTTP status code %d', e.code)
        return (None, None)
    except URLError:
        logging.error('Failed to connect with network.')
        return (None, None)


# Run the re-link only when executed as a script, not when imported.
if __name__ == "__main__":
    __repoint()