2 minutes
WARCing up the wrong tree - Part II
A continuation of my WARCings.
Writing WARCs with warcio
The following code logs in to a website and uses warcio to write the logged-in pages to a WARC file. This is especially useful for capturing basic webpages and media files. It would need to be extended in order to capture embedded resources (for example images, stylesheets and scripts) - though that is probably relatively straightforward.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Authenticated WARC capture. Given a .txt file of URLs, this will log in to a website
and write the logged-in pages into a WARC file."""
import time
from bs4 import BeautifulSoup
from warcio import WARCWriter
from warcio.capture_http import capture_http
import requests # requests must be imported after capture_http
# Login credentials (placeholders - replace with real values before running)
USERNAME = 'craiglmccarthy'
PASSWORD = 'password'
LOGIN_PAGE = 'https://loginpage.com'
# A .txt file containing URLs to be added to WARC file, read one per line
URLS_TO_WARC = 'exampleURLs.txt'
WARC_NAME = 'output.warc.gz'
# Form fields posted to LOGIN_PAGE, make empty to get public view of website
payload = {
    'Username': USERNAME,
    'Password': PASSWORD,
}
# Load the .txt file and build list. Surrounding whitespace is stripped and
# blank lines are dropped: an empty string is not a URL, so it would only
# inflate the count printed below and fail when it is requested later.
with open(URLS_TO_WARC, 'r') as f:
    stripped_lines = (line.strip() for line in f)
    lines_file = [url for url in stripped_lines if url]
print('Number of URLs to be added to WARC:', len(lines_file), '\n')
# Short pause so the count above can be read before the capture output starts
time.sleep(3)
# Start capture session: requests made inside this context are written to
# WARC_NAME by warcio.
# NOTE(review): the login GET/POST below also run inside the capture context,
# so they are presumably recorded in the WARC as well (including the posted
# credentials) - confirm before sharing the resulting file.
with capture_http(WARC_NAME):
    # Start requests session---------------------------------------------------
    with requests.Session() as sess:
        # Set headers / emulate a real browser. Adjacent string literals are
        # joined into one value, so the header does not pick up source
        # indentation the way a backslash continuation inside the quotes does.
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/50.0.2661.102 Safari/537.36'
            ),
        }
        # Install the headers on the session so that every request - the
        # login exchange and each page fetch in the loop below - is sent with
        # the same User-Agent.
        sess.headers.update(headers)

        res = sess.get(LOGIN_PAGE)

        # Get validation token by parsing HTML. Prefer the <input> whose name
        # matches the field we post back; fall back to the last <input> on
        # the page, which is where this script originally looked.
        signin = BeautifulSoup(res.text, 'html.parser')
        token_input = signin.find(
            'input', attrs={'name': '__RequestVerificationToken'})
        if token_input is None:
            all_inputs = signin.find_all('input')
            token_input = all_inputs[-1] if all_inputs else None
        token = token_input.get('value') if token_input is not None else None
        if token is None:
            # Fail with a readable message instead of a bare IndexError or
            # KeyError when the login page has no usable token field.
            raise SystemExit(
                'No validation token found on login page: ' + LOGIN_PAGE)

        # Add token to payload
        payload['__RequestVerificationToken'] = token

        # Submit login details to login page
        res = sess.post(LOGIN_PAGE, data=payload)
        if not res.ok:
            # Keep going (the pages may still be publicly viewable) but make
            # it visible that the capture is probably not authenticated.
            print('Warning: login request returned HTTP', res.status_code)

        # Request URLs in the context of the WARC capture----------------------
        for url in lines_file:
            if not url:
                # Blank entry from the URL list; nothing to request
                continue
            print('Getting..', url)
            try:
                sess.get(url)
            except requests.exceptions.RequestException as err:
                # One bad URL or network error should not abort the batch
                print('Failed to get', url, '-', err)