1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#! /usr/bin/env python
'''
Minimal script to dump content from a m.tou.tv AkamaiGHost back-end based on an
intercepted m3u playlist, as captured by proxy.py (i.e. a three part HTTP
request headers, HTTP response headers, and m3u playlist, each separated by one
empty line).

Requires cPython 2.7.x and the Requests >= 0.14.2 library, as well as openssl
in your PATH (tested on openssl 1.0.0c) for decryption.

Invoke with --help to see the available options. Please note that it is possible
to interrupt a dump (using ^C for instance), as the script has download resuming
capabilities built-in.

Sylvain <fourmanoit@gmail.com>, 2012 - 2013.

History
-------
2013-01-29: patch a bug in highest bandwidth determination in primary playlist;
            thanks to Kristoffer Laurin-Racicot for spotting the problem
            and providing the fix.
2013-01-12: add support for requests 1.x branch, make simplifications
            for supporting Mac OS X, based on feedback by chesstu p.
2012-12-17: add estimated time of arrival computation.
2012-12-16: correct a fetching bug with up-to-date version of Requests;
            thanks to Patrick St-Onge for his assistance in pinpointing it,
            and to Kenneth Reitz for taking the time to tell me how to
            properly use the raw interface on newer releases of Requests.
2012-12-15: initial release.
'''
import re, os, os.path, subprocess, shutil, datetime, \
       functools, contextlib, json, logging
import requests

class ETA(list):
    '''
    Minimalistic estimated time of arrival computation.

    The instance accumulates (completion_ratio, timestamp) samples; the
    finish time is extrapolated from the average progress rate over the
    most recent samples.
    '''
    def update(self, completion_ratio):
        # Completion ratios must be strictly increasing between samples.
        assert(len(self) == 0 or completion_ratio > self[-1][0])
        self.append((completion_ratio, datetime.datetime.now()))
        return self.compute()

    def compute(self):
        # At least two samples are required before a rate can be derived.
        if len(self) < 2:
            return
        first_ratio, first_time = self[0]
        # Seconds-per-unit-of-progress, measured against the first
        # sample, for (at most) the five most recent samples.
        rates = []
        for ratio, moment in self[:0:-1]:
            rates.append(
                (moment - first_time).total_seconds() / (ratio - first_ratio))
            if len(rates) == 5:
                break
        average_rate = sum(rates) / len(rates)
        return first_time + datetime.timedelta(seconds = average_rate)
        
def get_requests_numerical_version(version = None):
    '''
    Compute a numerical value out of a dotted version string,
    defaulting to requests.__version__ when version is None.
    For instance, "0.14.2" yields 1402.

    Each dot-separated component is weighted by a power of 100, so the
    resulting integers order like the versions themselves as long as
    every component stays below 100.
    '''
    if version is None:
        version = requests.__version__
    return sum(int(component) * (100 ** position)
               for position, component in enumerate(
                   reversed(version.split('.'))))

@contextlib.contextmanager
def get_states(path, default):
    '''
    Context manager yielding a mutable "states" object loaded from the
    JSON file at path (falling back to default when the file is missing
    or unparsable), and writing it back to the same file on exit.

    The write-back also happens when an exception -- including
    KeyboardInterrupt from ^C -- escapes the managed block, which is
    what gives the script its download resuming capability.
    '''
    logging.info('loading states from "%s"...' % path)
    try:
        # Close the handle promptly instead of leaking it until
        # garbage collection.
        with open(path) as stream:
            data = json.load(stream)
    except (IOError, ValueError):
        data = default
    log_save = False
    try:
        yield data
    except:
        # Deliberately bare: even KeyboardInterrupt must trigger the
        # "saving" notice below before being re-raised, so the user
        # knows progress was preserved.
        log_save = True
        raise
    finally:
        if log_save is True:
            logging.info('saving states to "%s"...' % path)
        with open(path, 'w') as out:
            json.dump(data, out, indent = 4)

def parse_primary_playlist(payload):
    '''
    Parse the first level playlist, containing the various quality streams.

    payload is the three part proxy.py capture: HTTP request headers,
    HTTP response headers and the m3u playlist, separated by empty lines.

    Return a tuple of:
    - a reusable url fetcher (requests.get with the captured request
      headers and response cookies pre-bound),
    - the url of the second level playlist for the available stream with
      the highest bandwidth.
    '''
    logging.info('Parsing primary playlist...')
    request, response, m3u =  re.compile('^\r$', re.M).split(payload)
    # Split each header on the first ': ' only, so header values that
    # themselves contain ': ' cannot break the dict construction
    # (the original rsplit without maxsplit split on every occurrence).
    headers = dict(header.split(': ', 1) for header in request.split('\r\n')[1:]
                   if len(header) > 0)
    cookies = dict(m.group(1).split('=', 1)
                   for m in (re.match('Set-Cookie: ([^;]*);', header)
                             for header in response.split('\r\n'))
                   if m is not None)
    # Among the variant streams, keep the one advertising the highest
    # BANDWIDTH attribute on its #EXT-X-STREAM-INF line.
    best_stream_url = max(
        (stream.split('\n', 1)
         for stream in re.compile('^#EXT-X-STREAM-INF:', re.M).split(m3u)[1:]),
        key = lambda s: int(dict(kv.split('=', 1)
                             for kv in s[0].split(',') if '=' in kv).get(
            'BANDWIDTH', 0)))[1].strip()
    return (functools.partial(requests.get,
                              headers = headers,
                              cookies = cookies),
            best_stream_url)

def parse_secondary_playlist(payload):
    '''
    Parse the second level playlist, and return the location of the
    AES-128 decryption key as well as the sequence of urls for the
    individual video segments.
    '''
    logging.info('Parsing secondary playlist...')
    # Tolerate both LF and CRLF line endings, since the playlist comes
    # straight from an HTTP response body.
    crypt_key_url = re.compile(
        '#EXT-X-KEY:.*URI="([^"].*)"\r?$', re.M).search(payload).group(1)
    # Bug fix: the original iterated over the undefined global "r"
    # (r.text); the segment urls must come from the payload argument.
    segment_urls = [line.strip() for line in payload.split('\n')
                    if line.startswith('http://')]
    return crypt_key_url, segment_urls

def concatenize(sources, destination):
    '''
    Concatenate the content of every file in sources (an iterable of
    paths, in order) into a single file at destination.
    '''
    with open(destination, 'wb') as out:
        for source in sources:
            # Close each input promptly instead of leaking the handle
            # until garbage collection.
            with open(source, 'rb') as stream:
                shutil.copyfileobj(stream, out)
    
def decrypt_stream(input_path, output_path, crypt_key):
    '''
    Invoke openssl on the AES-128 encrypted stream at input_path,
    writing the clear stream to output_path. crypt_key is the key as an
    hexadecimal string, as expected by openssl -K.

    See http://tools.ietf.org/id/draft-pantos-http-live-streaming-08.txt
    '''
    # The initialization vector is always 1, because there is no IV attribute to
    # EXT-X-KEY, thus we use the common sequence number to all segments, which
    # is 1.
    logging.info('Decrypting stream to "%s"...' % output_path)
    # Build the argument vector directly instead of splitting a command
    # string, so paths containing spaces are passed through intact.
    args = ['openssl', 'aes-128-cbc', '-d',
            '-in', input_path, '-out', output_path, '-nosalt',
            '-iv', '00000000000000000000000000000001', '-K', crypt_key]
    logging.info('Invoking "%s"...' % ' '.join(args))
    r = subprocess.Popen(args).wait()
    if r == 0:
        logging.info('Decryption success')
    else:
        # A non-zero openssl exit status is an actual failure: report it
        # at error level, not as plain information.
        logging.error('Decryption failure')
    return
    

if __name__ == '__main__':
    # Python 2 only: ConfigParser was renamed configparser in Python 3,
    # and the script below also relies on the print statement, file()
    # and xrange().
    import ConfigParser, argparse, sys

    # Parse (or create) a config file
    def get_option(conf, section, option, default = None):
        '''
        Return conf's option from section, falling back to default when
        either the section or the option is missing.
        '''
        try:
            return conf.get(section, option)
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
            return default
        
    conf       = ConfigParser.ConfigParser()
    conf_path  = os.path.expanduser('~/.m3udump')
    # conf.read returns the list of files successfully parsed; an empty
    # list means there is no usable config file yet.
    conf_found = len(conf.read(conf_path)) > 0
    
    basedir  = get_option(conf, 'General', 'basedir' , '/tmp')
    basename = get_option(conf, 'General', 'basename', 'out')
    
    # First run: persist the defaults so the user has a file to tweak.
    if not conf_found:
        conf.add_section('General')
        conf.set('General', 'basedir' , basedir)
        conf.set('General', 'basename', basename)
        with open(conf_path, 'w') as cf:
            conf.write(cf)
    
    # Expose a minimalistic CLI
    p = argparse.ArgumentParser(
        description = ('Dump content out of Akamai for m.tou.tv content '
                       'based on an intercepted m3u playlist.'))
    p.add_argument('playlist', default = None, nargs='?',
                   help = ('primary playlist path from proxy.py '
                           '(if not given, stdin is assumed)'))
    p.add_argument('-f', '--force', default = False, action='store_true',
                   help = ('force the reset of the states file, '
                           'which means the refetch of the stream description '
                           'and all segments'))
    p.add_argument('--basedir', default = basedir,
                   help = ('set the default output directory '
                           '(default to "%s", if set to empty string, use '
                           'the current working directory)') % basedir)
    p.add_argument('--basename', default = basename,
                   help = ('set the default basename prefix used when '
                           'outputting files (default to "%s")') % basename)
    p.add_argument('-v', '--verbose', default = False, action="store_true",
                   help = 'make script more verbose')
    
    args = p.parse_args()

    # Check requests version: 1402 encodes "0.14.2" (each component
    # weighted by a power of 100).
    requests_version = get_requests_numerical_version()
    if requests_version < 1402:
        p.error(
            'installed version or Requests (%s) should be >= 0.14.2 ' %
            requests.__version__)

    def build_path(suffix):
        # All output files share the same directory and basename prefix,
        # e.g. /tmp/out_states.json.
        return os.path.join(args.basedir or os.getcwd(),
                            '_'.join((args.basename, suffix)))

    # Setup logging
    logging.basicConfig(format = '%(asctime)-15s (%(levelname)s) %(message)s',
                        level = logging.DEBUG if args.verbose else logging.INFO)

    # Start processing... get_states persists states on exit (even on
    # ^C), which is what makes download resuming work.
    with get_states(build_path('states.json'), [None, []]) as states:
        # states is: [crypt_key, [[segment1_url, reception_state], ...]]
        if args.force:
            logging.info('Forcing refetch...')
            states[0] = None; states[1] = []
            
        # Primary playlist processing; file() is the Python 2 builtin.
        url_get, best_stream_url = parse_primary_playlist(
            file(args.playlist).read() if args.playlist is not None
            else sys.stdin.read())

        # Wrapper for supporting the change in calling convention:
        # requests < 1.0 used prefetch=False, >= 1.0 uses stream=True.
        if requests_version < 10000:
            url_get_streaming = functools.partial(url_get, prefetch = False)
        else:
            url_get_streaming = functools.partial(url_get, stream = True)

        # As needed, secondary playlist processing (skipped on resume,
        # since states[0] already holds the crypt key).
        if states[0] is None:
            # Fetch, then parse secondary playlist
            logging.info('Fetching secondary playlist (for best stream)...')
            r = url_get(best_stream_url)
            if r.status_code != 200:
                raise RuntimeError('could not fetch stream description')
            
            with open(build_path('stream.m3u'), 'w') as out: out.write(r.text)
            crypt_key_url, segment_urls = parse_secondary_playlist(r.text)

            # Retrieve the crypt key: read the 16 raw key bytes and
            # store them hex-encoded (Python 2: iterating a str yields
            # 1-char strings, hence ord()).
            logging.info('Fetching the decryption key...')
            states[0] = ''.join('%.2x' % ord(c) for c in url_get_streaming(
                crypt_key_url).raw.read(16))

            # Reset the segments states
            states[1] = [[segment_url, False] for segment_url in segment_urls]

        # Download the segments in sequence
        def build_segment_path(idx): return build_path('segment_%.3d.ts' % idx)
        n = len(states[1])
        eta = ETA()
        for idx, (segment_url, segment_received) in enumerate(states[1]):
            # Resume support: skip segments already flagged as received.
            if segment_received: continue
            logging.info('Fetching segment %d/%d...' % (idx + 1, n))
            download = url_get_streaming(segment_url).raw
            with open(build_segment_path(idx), 'wb') as out:
                # Stream the segment to disk in 64 KiB chunks, printing
                # a dot per chunk as a progress indicator.
                while True:
                    print '.',
                    sys.stdout.flush()
                    chunk = download.read(1024 * 64)
                    if len(chunk) == 0:
                        break
                    out.write(chunk)
                print
            ceta = eta.update(float(idx + 1) / n)
            if ceta is not None: logging.info('ETA: %s' % ceta)
            # Only mark the segment received once fully written.
            states[1][idx][1] = True

        # Contatenize the crypted segments together
        logging.info('Concatenizing the %d segments...' % n)
        crypted = build_path('crypted.ts')
        concatenize(sources = (build_segment_path(idx) for idx in xrange(n)),
                    destination = crypted)

        # Then, finally, decrypt it.
        final = build_path('final.ts')
        decrypt_stream(crypted, final, crypt_key = states[0])
        logging.info('Done: dumped stream stored in "%s".' % final)