#!/usr/bin/env python
"""
This script expects a URL of a flash embedding page from arte plus 7.
It will download the movie to <base name of page>.flv.

I have no idea whether the tracking pixel magic is necessary, but it *seems*
the server will kick you out if you don't retrieve it regularly.

rtmpdump bailing out now and then appears to be a known bug; however, the
explanations I've seen on the web seem quite bogus, so, like everybody
else, I'm only offering the retry loop as a workaround.
"""

import BeautifulSoup
import cgi
import os
import re
import sys
import subprocess
import time
import urllib2
import urlparse

# set to true for development: getWithCache then stores every response
# in a <url stripped to word characters>.cache file in the current
# directory and reuses it instead of hitting the network again
CACHE_RESULTS = False

# sd or hd; getStreamParameters picks the <url> element whose quality
# attribute has this value
QUALITY = "sd"

# Sent as User-Agent header with every request made by getWithCache.
# Change to whatever you like
USER_AGENT = "Mozilla 5.0 (compatible; Mosaic 7.3 (CP/M))"


def getWithCache(url, bypassCache=False, extraHeaders=None):
	"""returns the body of the document at url.

	When CACHE_RESULTS is true, responses are stored in a file in the
	current directory named after the URL, and later calls return the
	stored copy instead of accessing the network.  Pass bypassCache=True to
	force a fresh request (the cache file is still rewritten when
	CACHE_RESULTS is true).

	extraHeaders, if given, is a dictionary of additional HTTP request
	headers; on a name clash they override the default User-Agent.

	Errors from urllib2.urlopen or from reading/writing the cache file are
	propagated to the caller.
	"""
	# cache file name: the URL with everything but word characters removed
	cacheName = re.sub(r"[^\w]+", "", url)+".cache"
	if not bypassCache and CACHE_RESULTS and os.path.exists(cacheName):
		# binary mode: return the payload exactly as it was received
		with open(cacheName, "rb") as cacheFile:
			return cacheFile.read()

	headers = {"User-Agent": USER_AGENT}
	if extraHeaders:
		headers.update(extraHeaders)
	response = urllib2.urlopen(urllib2.Request(url, headers=headers))
	try:
		doc = response.read()
	finally:
		# release the connection even if reading fails midway
		response.close()

	if CACHE_RESULTS:
		with open(cacheName, "wb") as cacheFile:
			cacheFile.write(doc)
	return doc


def retrieveStream(movieURL, destName,
		streamURL, trackingURL, trackingReferrer):
	"""downloads the stream at streamURL to the file destName using rtmpdump.

	movieURL is the URL of the flash player (passed to rtmpdump's --swfVfy),
	trackingURL is requested every two seconds while rtmpdump is running,
	with trackingReferrer sent as the HTTP Referer header.

	rtmpdump is started at most twice; the second attempt is only made
	if the first one exits with a non-zero status.  Nothing is returned.
	Exceptions from the tracking request propagate, but rtmpdump is
	stopped first so it is not left running in the background.
	"""
	# -e makes rtmpdump resume a partial download, so a retry carries on
	# with what the previous attempt already wrote to destName.
	command = ["rtmpdump",
		"--swfVfy", str(movieURL),
		"-k", "2",
		"-r", str(streamURL),
		"-o", destName, "-e"]

	for retry in range(2):
		subProc = subprocess.Popen(command)
		try:
			while subProc.poll() is None:
				# The standard HTTP header is spelled "Referer"; the
				# former key "referrer" went out as a "Referrer:" header.
				getWithCache(trackingURL, bypassCache=True,
					extraHeaders={"Referer": trackingReferrer})
				time.sleep(2)
		finally:
			# only relevant when the loop above was left by an exception:
			# don't leave an orphaned rtmpdump behind
			if subProc.poll() is None:
				subProc.terminate()
				subProc.wait()
		if subProc.returncode==0:
			break
		print("rtmpdump failed, trying again.")


def getStreamParameters(embeddingURL):
	"""returns tuple suitable as arguments to retrieveStream for getting
	the video behind embeddingURL

	The tuple is (movieURL, destName, streamURL, trackingURL,
	trackingReferrer).  destName is the last path segment of embeddingURL
	with its extension replaced by .flv.

	IndexError or KeyError is raised when one of the documents does not
	contain the expected elements; errors from getWithCache propagate.
	"""
	destName = os.path.splitext(embeddingURL.split("/")[-1])[0]+".flv"

	# step 1: obtain first XML from embedding HTML
	# Fetch the URL we were passed (this used to read sys.argv[1], which
	# made the function useless outside of the command line script).
	soup = BeautifulSoup.BeautifulSoup(getWithCache(embeddingURL))
	movieURL = soup.findAll("param", attrs={"name":"movie"})[0]["value"]
	# the URL of the first XML is a query parameter of the player URL
	videoURL = cgi.parse_qs(urlparse.urlparse(movieURL).query
		)["videorefFileUrl"][0]

	# step 2: obtain second XML from first XML
	nxDoc = getWithCache(videoURL)
	nxURL = BeautifulSoup.BeautifulStoneSoup(nxDoc).findAll("video",
		lang="de")[0]["ref"]

	# step 3: obtain stream parameter from second XML
	nxDoc = getWithCache(nxURL)
	linkSoup = BeautifulSoup.BeautifulStoneSoup(nxDoc)
	streamURL = linkSoup.findAll("url", quality=QUALITY)[0].contents[0]
	trackingURL = linkSoup.findAll("tracking")[0]["url"]
	trackingReferrer = linkSoup.findAll("trackingreferer")[0]["url"]
	# rtmpdump could do with a flv hash.  However, there seems to
	# be no way of figuring it out.
	# hash = cgi.parse_qs(streamURL.split("?")[-1])["h"][0]
	return movieURL, destName, streamURL, trackingURL, trackingReferrer


def main():
	"""command line entry point: expects exactly one argument, the URL of
	the embedding page, and downloads the video behind it.

	With any other number of arguments, a usage message is printed and
	the script exits with status 1.
	"""
	arguments = sys.argv[1:]
	if len(arguments)==1:
		retrieveStream(*getStreamParameters(arguments[0]))
		return
	print("Usage: %s <arte+7 url>"%sys.argv[0])
	sys.exit(1)

# only start a download when run as a script, not when imported as a module
if __name__=="__main__":
	main()

