A continuation of my WARCings.

Writing WARCs with warcio

The following code logs into a website and uses warcio to write the logged-in pages to a WARC file. It is especially useful for capturing basic webpages and media files. It does not yet capture embedded resources; adding that is probably relatively straightforward.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Authenticated WARC capture. Given a .txt file of URLS, this will login to a website 
and write the logged-in pages into a WARC file."""

import time

from bs4 import BeautifulSoup
from warcio import WARCWriter
from warcio.capture_http import capture_http
import requests  # requests must be imported after capture_http

# Account details submitted to the login form, and the page they are sent to
USERNAME = 'craiglmccarthy'
PASSWORD = 'password'
LOGIN_PAGE = 'https://loginpage.com'

# Input: a .txt file of URLs to capture. Output: the WARC file to write.
URLS_TO_WARC = 'exampleURLs.txt'
WARC_NAME = 'output.warc.gz'

# Form fields posted to LOGIN_PAGE; make empty to get public view of website
payload = dict(Username=USERNAME, Password=PASSWORD)

# Load the .txt file and build the list of URLs, one entry per line.
# Lines that are empty after stripping whitespace are dropped: an empty
# string is not a fetchable URL, so keeping it would inflate the count
# printed below and fail when it is requested later.
with open(URLS_TO_WARC, 'r') as f:
    lines_file = [url for url in map(str.strip, f) if url]
print('Number of URLs to be added to WARC:', len(lines_file), '\n')
# Presumably a short pause so the count can be read before capture starts
time.sleep(3)

# Start capture session: HTTP traffic made inside this block is written to
# WARC_NAME by warcio's capture_http.
with capture_http(WARC_NAME):
    # Start requests session---------------------------------------------------
    with requests.Session() as sess:
        # Set headers / emulate a real browser.
        # Adjacent string literals are used instead of backslash continuations
        # inside one literal, which would embed the source indentation in the
        # header value.
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/50.0.2661.102 Safari/537.36'
            ),
        }
        # Install the headers on the session so that every request, including
        # the page fetches in the loop below, carries the same User-Agent.
        sess.headers.update(headers)

        res = sess.get(LOGIN_PAGE)

        # Get validation token by parsing the login page HTML. Prefer the
        # <input> whose name matches the field being posted; fall back to the
        # last <input> on the page if no such field exists.
        token_field = '__RequestVerificationToken'
        login_html = BeautifulSoup(res.text, 'html.parser')
        token_input = login_html.find('input', attrs={'name': token_field})
        if token_input is None:
            all_inputs = login_html.find_all('input')
            token_input = all_inputs[-1] if all_inputs else None
        if token_input is None or not token_input.has_attr('value'):
            raise RuntimeError(
                'Could not find a verification token on ' + LOGIN_PAGE)

        # Add token to payload
        payload[token_field] = token_input['value']

        # Submit login details to login page
        res = sess.post(LOGIN_PAGE, data=payload)
        if not res.ok:
            # Keep going (pages are then captured with whatever access the
            # session has), but make the failed login visible.
            print('Warning: login request returned HTTP', res.status_code)

        # Request URLs in the context of the WARC capture----------------------
        for url in lines_file:
            print('Getting..', url)
            try:
                sess.get(url)
            except requests.exceptions.RequestException as err:
                # One bad or unreachable URL should not abort the whole
                # capture; report it and continue with the next one.
                print('Failed..', url, '-', err)