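"""HAR dump addon for mitmproxy.

Records proxied HTTP(S) flows as HAR (HTTP Archive) entries and exposes a
small falcon-based REST resource for fetching the current HAR, starting and
ending HARs and pages, and configuring what gets captured. Written for the
BrowserUp Proxy's embedded mitmproxy.
"""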
import asyncio
import base64
import copy
import json
import tempfile
import typing
from datetime import datetime, timezone
from enum import Enum, auto

import dateutil.parser
import falcon

from mitmproxy import connections, ctx
from mitmproxy.net.http import cookies
from mitmproxy.utils import strutils
# A set of servers seen so far is maintained so we can avoid using
# 'connect' time for entries that reuse an existing connection.
SERVERS_SEEN: typing.Set[connections.ServerConnection] = set()
DEFAULT_PAGE_REF = "Default"
DEFAULT_PAGE_TITLE = "Default"
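# Key set on HAR entries that have already been returned to a client;
# filter_har_for_report uses it to trim such entries from later dumps.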
REQUEST_SUBMITTED_FLAG = "_request_submitted"
class HarCaptureTypes(Enum):
    REQUEST_HEADERS = auto()
    REQUEST_COOKIES = auto()
    REQUEST_CONTENT = auto()
    REQUEST_BINARY_CONTENT = auto()
    RESPONSE_HEADERS = auto()
    RESPONSE_COOKIES = auto()
    RESPONSE_CONTENT = auto()
    RESPONSE_BINARY_CONTENT = auto()
# Convenience groupings of capture types. These live at module level:
# container assignments inside an Enum body would be turned into broken
# pseudo-members rather than plain class attributes.
REQUEST_CAPTURE_TYPES = {
    HarCaptureTypes.REQUEST_HEADERS,
    HarCaptureTypes.REQUEST_CONTENT,
    HarCaptureTypes.REQUEST_BINARY_CONTENT,
    HarCaptureTypes.REQUEST_COOKIES}
RESPONSE_CAPTURE_TYPES = {
    HarCaptureTypes.RESPONSE_HEADERS,
    HarCaptureTypes.RESPONSE_CONTENT,
    HarCaptureTypes.RESPONSE_BINARY_CONTENT,
    HarCaptureTypes.RESPONSE_COOKIES}
HEADER_CAPTURE_TYPES = {
    HarCaptureTypes.REQUEST_HEADERS,
    HarCaptureTypes.RESPONSE_HEADERS}
NON_BINARY_CONTENT_CAPTURE_TYPES = {
    HarCaptureTypes.REQUEST_CONTENT,
    HarCaptureTypes.RESPONSE_CONTENT}
BINARY_CONTENT_CAPTURE_TYPES = {
    HarCaptureTypes.REQUEST_BINARY_CONTENT,
    HarCaptureTypes.RESPONSE_BINARY_CONTENT}
ALL_CONTENT_CAPTURE_TYPES = \
    NON_BINARY_CONTENT_CAPTURE_TYPES | BINARY_CONTENT_CAPTURE_TYPES
COOKIE_CAPTURE_TYPES = {
    HarCaptureTypes.REQUEST_COOKIES,
    HarCaptureTypes.RESPONSE_COOKIES}
class HarDumpAddonResource:
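    """Falcon resource backing the proxy's HAR REST endpoints.

    The surrounding server (not part of this file) routes GET requests here
    with a method_name path segment, which on_get dispatches to the matching
    on_* handler.
    """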
def addon_path(self):
return "har"
def __init__(self, harDumpAddOn):
self.name = "hardump"
self.harDumpAddOn = harDumpAddOn
    def on_get(self, req, resp, method_name):
        # Falcon may invoke this handler on a worker thread that has no
        # asyncio event loop yet, so ensure one exists before dispatching.
        try:
            asyncio.get_event_loop()
        except RuntimeError:
            asyncio.set_event_loop(asyncio.new_event_loop())
        getattr(self, "on_" + method_name)(req, resp)
def on_get_har(self, req, resp):
clean_har = req.get_param('cleanHar') == 'true'
har = self.harDumpAddOn.get_har(clean_har)
filtered_har = self.harDumpAddOn.filter_har_for_report(har)
har_file = self.harDumpAddOn.save_har(filtered_har)
if clean_har:
self.harDumpAddOn.mark_har_entries_submitted(har)
resp.status = falcon.HTTP_200
resp.content_type = falcon.MEDIA_JSON
resp.text = json.dumps({
"path": har_file.name,
"json": filtered_har
}, ensure_ascii=False)
def on_new_har(self, req, resp):
page_ref = req.get_param('pageRef')
page_title = req.get_param('pageTitle')
har = self.harDumpAddOn.new_har(page_ref, page_title, True)
har_file = self.harDumpAddOn.save_har(har)
resp.status = falcon.HTTP_200
resp.text = json.dumps({
"path": har_file.name,
"json": har
}, ensure_ascii=False)
def on_end_har(self, req, resp):
har = self.harDumpAddOn.end_har()
har_file = self.harDumpAddOn.save_har(har)
resp.status = falcon.HTTP_200
resp.text = json.dumps({
"path": har_file.name,
"json": har
}, ensure_ascii=False)
def on_new_page(self, req, resp):
page_ref = req.get_param('pageRef')
page_title = req.get_param('pageTitle')
har = self.harDumpAddOn.new_page(page_ref, page_title)
har_file = self.harDumpAddOn.save_har(har)
resp.status = falcon.HTTP_200
resp.text = json.dumps({
"path": har_file.name,
"json": har
}, ensure_ascii=False)
def on_end_page(self, req, resp):
har = self.harDumpAddOn.end_page()
har_file = self.harDumpAddOn.save_har(har)
resp.status = falcon.HTTP_200
resp.text = json.dumps({
"path": har_file.name,
"json": har
}, ensure_ascii=False)
def on_set_har_capture_types(self, req, resp):
capture_types = req.get_param('captureTypes')
capture_types = capture_types.strip("[]").split(",")
capture_types_parsed = []
for ct in capture_types:
ct = ct.strip(" ")
if ct == "":
break
if not hasattr(HarCaptureTypes, ct):
resp.status = falcon.HTTP_400
resp.text = "Invalid HAR Capture type"
return
capture_types_parsed.append(HarCaptureTypes[ct])
self.harDumpAddOn.har_capture_types = capture_types_parsed
resp.status = falcon.HTTP_200
class HarDumpAddOn:
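    """mitmproxy addon that accumulates flows into a HAR structure.

    Tracks the current HAR, the current page, per-connection timing data,
    and the set of HarCaptureTypes that controls how much of each request
    and response is recorded.
    """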
def __init__(self):
self.num = 0
self.har = None
self.har_page_count = 0
self.har_capture_types = []
self.current_har_page = None
self.dns_resolution_started_nanos = 0
self.connection_started_nanos = 0
self.send_started_nanos = 0
self.send_finished_nanos = 0
self.response_receive_started_nanos = 0
self.http_connect_timings = {}
    def get_har_entry(self, flow):
        # Lazily attach a HAR entry to the request the first time it is
        # asked for, and register it in the active HAR.
        if not hasattr(flow.request, 'har_entry'):
            flow.request.har_entry = self.generate_har_entry(flow.request.url)
            self.append_har_entry(flow.request.har_entry)
        return flow.request.har_entry
def is_har_entry_submitted(self, har_entry):
return REQUEST_SUBMITTED_FLAG in har_entry
def har_entry_has_response(self, har_entry):
return bool(har_entry['response'])
def har_entry_clear_request(self, har_entry):
har_entry['request'] = {}
def filter_har_for_report(self, har):
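        """Deep-copy the HAR for reporting.

        Entries that were already submitted to a client and have since
        received a response are kept with their request data cleared;
        everything else is reported unchanged.
        """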
if har is None:
return har
har_copy = copy.deepcopy(har)
entries_to_report = []
for entry in har_copy['log']['entries']:
if self.is_har_entry_submitted(entry):
if self.har_entry_has_response(entry):
del entry[REQUEST_SUBMITTED_FLAG]
self.har_entry_clear_request(entry)
entries_to_report.append(entry)
else:
entries_to_report.append(entry)
har_copy['log']['entries'] = entries_to_report
return har_copy
def mark_har_entries_submitted(self, har):
if har is not None:
for entry in har['log']['entries']:
entry[REQUEST_SUBMITTED_FLAG] = True
def get_har(self, clean_har):
if clean_har:
return self.new_har(DEFAULT_PAGE_REF, DEFAULT_PAGE_TITLE)
return self.har
def get_default_har_page(self):
for hp in self.har['log']['pages']:
if hp['title'] == DEFAULT_PAGE_TITLE:
return hp
return None
def generate_new_har_log(self):
return {
"version": "1.1",
"creator": {
"name": "BrowserUp Proxy",
"version": "0.1",
"comment": ""
},
"entries": [],
"pages": []
}
def generate_new_har(self):
return {
"log": self.generate_new_har_log()
}
def generate_new_page_timings(self):
return {
"onContentLoad": 0,
"onLoad": 0,
"comment": ""
}
def generate_new_har_page(self):
return {
"title": "",
"id": "",
"startedDateTime": "",
"pageTimings": self.generate_new_page_timings()
}
def generate_new_har_post_data(self):
return {
"mimeType": "multipart/form-data",
"params": [],
"text": "plain posted data",
"comment": ""
}
def generate_har_entry_request(self):
return {
"method": "",
"url": "",
"httpVersion": "",
"cookies": [],
"headers": [],
"queryString": [],
"headersSize": 0,
"bodySize": 0,
"comment": "",
"additional": {}
}
def generate_har_timings(self):
return {
"blockedNanos": -1,
"dnsNanos": -1,
"connectNanos": -1,
"sslNanos": -1,
"sendNanos": 0,
"waitNanos": 0,
"receiveNanos": 0,
"comment": ""
}
def generate_har_entry_response(self):
return {
"status": 0,
"statusText": "",
"httpVersion": "",
"cookies": [],
"headers": [],
"content": {
"size": 0,
"compression": 0,
"mimeType": "",
"text": "",
"encoding": "",
"comment": "",
},
"redirectURL": "",
"headersSize": -1,
"bodySize": -1,
"comment": 0,
}
def generate_har_entry_response_for_failure(self):
result = self.generate_har_entry_response()
result['status'] = 0
result['statusText'] = ""
result['httpVersion'] = "unknown"
result['_errorMessage'] = "No response received"
return result
def generate_har_entry(self, request_url):
return {
"pageref": "",
"startedDateTime": "",
"time": 0,
"request": {},
"response": {},
"cache": {},
"timings": self.generate_har_timings(),
"serverIPAddress": "",
"connection": "",
"comment": "",
"_url": request_url
}
def get_or_create_har(self, page_ref, page_title, create_page=False):
if self.har is None:
self.new_har(page_ref, page_title, create_page)
return self.har
def new_page(self, page_ref, page_title):
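        """Start a new page, ending the current one if present.

        Returns a copy of the HAR up to and including the page that just
        ended, or None if there was no current page.
        """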
ctx.log.info(
'Creating new page with initial page ref: {}, title: {}'.
format(page_ref, page_title))
har = self.get_or_create_har(page_ref, page_title, False)
end_of_page_har = None
if self.current_har_page is not None:
current_page_ref = self.current_har_page['id']
self.end_page()
end_of_page_har = self.copy_har_through_page_ref(har,
current_page_ref)
if page_ref is None:
self.har_page_count += 1
page_ref = "Page " + str(self.har_page_count)
if page_title is None:
page_title = page_ref
new_page = self.generate_new_har_page()
new_page['title'] = page_title
new_page['id'] = page_ref
new_page['startedDateTime'] = datetime.utcnow().isoformat()
har['log']['pages'].append(new_page)
self.current_har_page = new_page
return end_of_page_har
def copy_har_through_page_ref(self, har, page_ref):
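        """Return a copy of the HAR restricted to the pages up to and
        including page_ref, together with the entries belonging to them."""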
if har is None:
return None
if har['log'] is None:
return self.generate_new_har()
page_refs_to_copy = []
for page in har['log']['pages']:
page_refs_to_copy.append(page['id'])
if page_ref == page['id']:
break
log_copy = self.generate_new_har_log()
for entry in har['log']['entries']:
if entry['pageref'] in page_refs_to_copy:
log_copy['entries'].append(entry)
for page in har['log']['pages']:
if page['id'] in page_refs_to_copy:
log_copy['pages'].append(page)
har_copy = self.generate_new_har()
har_copy['log'] = log_copy
return har_copy
def get_current_page_ref(self):
har_page = self.current_har_page
if har_page is None:
har_page = self.get_or_create_default_page()
return har_page['id']
def get_or_create_default_page(self):
default_page = self.get_default_page()
if default_page is None:
default_page = self.add_default_page()
return default_page
def add_default_page(self):
self.get_or_create_har(DEFAULT_PAGE_REF, DEFAULT_PAGE_TITLE, False)
new_page = self.generate_new_har_page()
new_page['title'] = DEFAULT_PAGE_REF
new_page['startedDateTime'] = datetime.utcnow().isoformat()
new_page['id'] = DEFAULT_PAGE_REF
self.har['log']['pages'].append(new_page)
return new_page
def get_default_page(self):
for p in self.har['log']['pages']:
if p['id'] == DEFAULT_PAGE_REF:
return p
return None
def new_har(self, initial_page_ref, initial_page_title, create_page=False):
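        """Start a fresh HAR and return the one it replaces.

        Entries of the old HAR that are still awaiting a response are
        carried over into the new HAR so they are not lost.
        """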
if create_page:
ctx.log.info(
'Creating new har with initial page ref: {}, title: {}'.
format(initial_page_ref, initial_page_title))
else:
ctx.log.info('Creating new har without initial page')
old_har = self.end_har()
self.har_page_count = 0
self.har = self.generate_new_har()
if create_page:
self.new_page(initial_page_ref, initial_page_title)
self.copy_entries_without_response(old_har)
return old_har
def copy_entries_without_response(self, old_har):
if old_har is not None:
for entry in old_har['log']['entries']:
if not self.har_entry_has_response(entry):
self.har['log']['entries'].append(entry)
def end_har(self):
ctx.log.info('Ending current har...')
old_har = self.har
        if old_har is None:
            return None
self.end_page()
self.har = None
return old_har
def end_page(self):
ctx.log.info('Ending current page...')
previous_har_page = self.current_har_page
self.current_har_page = None
if previous_har_page is None:
return
if 'startedDateTime' in previous_har_page:
on_load_delta_ms = (datetime.utcnow() - dateutil.parser.isoparse(
previous_har_page['startedDateTime'])).total_seconds() * 1000
previous_har_page['pageTimings']['onLoad'] = int(on_load_delta_ms)
        default_har_page = self.get_default_har_page()
        if default_har_page is not None:
            if 'startedDateTime' in default_har_page:
                default_har_page['pageTimings']['onLoad'] = int(
                    (datetime.utcnow() - dateutil.parser.isoparse(
                        default_har_page['startedDateTime'])
                     ).total_seconds() * 1000)
def add_har_page(self, pageRef, pageTitle):
ctx.log.debug('Adding har page with ref: {} and title: {}'.format(pageRef, pageTitle))
har_page = {
"id": pageRef,
"title:": pageTitle,
"startedDateTime": datetime.utcnow().isoformat(),
"pageTimings": {
"onContentLoad": 0,
"onLoad": 0,
"comment": ""
}
}
self.har['log']['pages'].append(har_page)
return har_page
def get_resource(self):
return HarDumpAddonResource(self)
def save_har(self, har):
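        """Serialize the HAR as JSON into a named temp file and return the
        closed file object; callers read .name for the path."""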
json_dump: str = json.dumps(har, indent=2)
tmp_file = tempfile.NamedTemporaryFile(mode="wb", prefix="har_dump_",
delete=False)
raw: bytes = json_dump.encode()
tmp_file.write(raw)
tmp_file.flush()
tmp_file.close()
return tmp_file
def get_full_url(self, request):
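        """Reconstruct the full URL for a HAR entry. CONNECT requests are
        rendered as https://host[:port]; everything else uses the request
        URL when a scheme is known."""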
host_port = request.host
if request.method == 'CONNECT':
            if request.port != 443:
host_port = host_port + ':' + str(request.port)
host_port = 'https://' + host_port
else:
if request.scheme is not None:
host_port = request.url
else:
host_port = host_port + ":" + str(request.port)
return host_port
    def consume_http_connect_timing(self, client_conn):
        # Pop so each connection's CONNECT timings are only applied once.
        return self.http_connect_timings.pop(client_conn, None)
def populate_har_entry_with_default_response(self, flow):
full_url = self.get_full_url(flow.request)
ctx.log.debug('Populating har entry for request: {}'.format(full_url))
har_entry = self.get_har_entry(flow)
har_entry['pageref'] = self.get_current_page_ref()
har_entry['startedDateTime'] = datetime.fromtimestamp(flow.request.timestamp_start, timezone.utc).isoformat()
har_request = self.generate_har_entry_request()
har_request['method'] = flow.request.method
har_request['url'] = full_url
har_request['httpVersion'] = flow.request.http_version
har_request['queryString'] = self.name_value(flow.request.query or {})
har_request['headersSize'] = len(str(flow.request.headers))
har_entry['request'] = har_request
def append_har_entry(self, har_entry):
har = self.get_or_create_har(DEFAULT_PAGE_REF, DEFAULT_PAGE_TITLE, True)
har['log']['entries'].append(har_entry)
def request(self, flow):
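        """mitmproxy request hook: start this flow's HAR entry and capture
        request data according to the configured capture types."""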
if 'AllowListFiltered' in flow.metadata or 'BlockListFiltered' in flow.metadata:
return
har_entry = self.get_har_entry(flow)
self.populate_har_entry_with_default_response(flow)
req_url = 'none'
if flow.request is not None:
req_url = flow.request.url
ctx.log.debug('Incoming request, url: {}'.format(req_url))
self.get_or_create_har(DEFAULT_PAGE_REF, DEFAULT_PAGE_TITLE, True)
if HarCaptureTypes.REQUEST_COOKIES in self.har_capture_types:
self.capture_request_cookies(flow)
if HarCaptureTypes.REQUEST_HEADERS in self.har_capture_types:
self.capture_request_headers(flow)
        if HarCaptureTypes.REQUEST_CONTENT in self.har_capture_types:
self.capture_request_content(flow)
har_entry['request']['bodySize'] = \
len(flow.request.raw_content) if flow.request.raw_content else 0
        connect_timing = self.consume_http_connect_timing(flow.client_conn)
if connect_timing is not None:
har_entry['timings']['sslNanos'] = connect_timing['sslHandshakeTimeNanos']
har_entry['timings']['connectNanos'] = connect_timing['connectTimeNanos']
har_entry['timings']['blockedNanos'] = connect_timing['blockedTimeNanos']
har_entry['timings']['dnsNanos'] = connect_timing['dnsTimeNanos']
def capture_request_cookies(self, flow):
har_entry = self.get_har_entry(flow)
har_entry['request']['cookies'] = \
self.format_request_cookies(flow.request.cookies.fields)
def capture_request_headers(self, flow):
har_entry = self.get_har_entry(flow)
har_entry['request']['headers'] = \
self.name_value(flow.request.headers)
def capture_request_content(self, flow):
har_entry = self.get_har_entry(flow)
params = [
{"name": a, "value": b}
for a, b in flow.request.urlencoded_form.items(multi=True)
]
har_entry["request"]["postData"] = {
"mimeType": flow.request.headers.get("Content-Type", ""),
"text": flow.request.get_text(strict=False),
"params": params
}
def response(self, flow):
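        """mitmproxy response hook: fill in response data, timings, and
        the server address on the flow's HAR entry."""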
        ctx.log.debug('Incoming response for request to url: {}'.format(flow.request.url))
        if 'AllowListFiltered' in flow.metadata or 'BlockListFiltered' in flow.metadata:
            ctx.log.debug('Block/Allow list filtered, return nothing.')
            return
        har_entry = self.get_har_entry(flow)
# -1 indicates that these values do not apply to current request
self.get_or_create_har(DEFAULT_PAGE_REF, DEFAULT_PAGE_TITLE, True)
ssl_time = -1
connect_time = -1
if flow.server_conn and flow.server_conn not in SERVERS_SEEN:
connect_time = (flow.server_conn.timestamp_tcp_setup -
flow.server_conn.timestamp_start)
if flow.server_conn.timestamp_tls_setup is not None:
ssl_time = (flow.server_conn.timestamp_tls_setup -
flow.server_conn.timestamp_tcp_setup)
SERVERS_SEEN.add(flow.server_conn)
timings = self.calculate_timings(connect_time, flow, ssl_time)
timings['dnsNanos'] = int(har_entry['timings']['dnsNanos'])
full_time = sum(v for v in timings.values() if v > -1)
# Response body size and encoding
response_body_size = len(
flow.response.raw_content) if flow.response.raw_content else 0
response_body_decoded_size = len(
flow.response.content) if flow.response.content else 0
response_body_compression = response_body_decoded_size - response_body_size
har_response = self.generate_har_entry_response()
har_response["status"] = flow.response.status_code
har_response["statusText"] = flow.response.reason
har_response["httpVersion"] = flow.response.http_version
if HarCaptureTypes.RESPONSE_COOKIES in self.har_capture_types:
har_response["cookies"] = \
self.format_response_cookies(flow.response.cookies.fields)
if HarCaptureTypes.RESPONSE_HEADERS in self.har_capture_types:
har_response["headers"] = self.name_value(flow.response.headers)
        # redirectURL is populated from the Location header below.
content = har_response['content']
content['size'] = response_body_size
content['compression'] = response_body_compression
content['mimeType'] = flow.response.headers.get('Content-Type', '')
        if HarCaptureTypes.RESPONSE_CONTENT in self.har_capture_types:
            # Guard against responses whose body was stripped (content is None).
            if flow.response.content and strutils.is_mostly_bin(flow.response.content):
                if HarCaptureTypes.RESPONSE_BINARY_CONTENT in self.har_capture_types:
                    har_response["content"]["text"] = base64.b64encode(
                        flow.response.content).decode()
                    har_response["content"]["encoding"] = "base64"
            else:
                har_response["content"]["text"] = flow.response.get_text(
                    strict=False)
har_response["redirectURL"] = flow.response.headers.get('Location', '')
har_response["headersSize"] = len(str(flow.response.headers))
har_response["bodySize"] = response_body_size
har_entry['response'] = har_response
har_entry['time'] = self.nano_to_ms(full_time)
har_entry['pageref'] = self.get_current_page_ref()
har_entry['timings'] = timings
if flow.server_conn.connected():
har_entry["serverIPAddress"] = str(
flow.server_conn.ip_address[0])
ctx.log.debug('Populated har entry for response: {}, entry: {}'.format(flow.request.url, str(har_entry)))
def calculate_timings(self, connect_time, flow, ssl_time):
timings_raw = {
'sendNanos': flow.request.timestamp_end - flow.request.timestamp_start,
'receiveNanos': flow.response.timestamp_end - flow.response.timestamp_start,
'waitNanos': flow.response.timestamp_start - flow.request.timestamp_end,
'connectNanos': connect_time,
'sslNanos': ssl_time,
}
        # The raw timings above are in fractional seconds; the downstream HAR
        # timings parser expects integer nanoseconds, so convert. A value of
        # -1 means the phase does not apply and is passed through unchanged.
return {
k: int(self.sec_to_nano(v)) if v != -1 else -1
for k, v in timings_raw.items()
}
def format_cookies(self, cookie_list):
rv = []
for name, value, attrs in cookie_list:
cookie_har = {
"name": name,
"value": value,
}
# HAR only needs some attributes
for key in ["path", "domain", "comment"]:
if key in attrs:
cookie_har[key] = attrs[key]
# These keys need to be boolean!
for key in ["httpOnly", "secure"]:
cookie_har[key] = bool(key in attrs)
# Expiration time needs to be formatted
expire_ts = cookies.get_expiration_ts(attrs)
if expire_ts is not None:
cookie_har["expires"] = datetime.fromtimestamp(expire_ts,
timezone.utc).isoformat()
rv.append(cookie_har)
return rv
def format_request_cookies(self, fields):
return self.format_cookies(cookies.group_cookies(fields))
def format_response_cookies(self, fields):
return self.format_cookies((c[0], c[1][0], c[1][1]) for c in fields)
def name_value(self, obj):
"""
Convert (key, value) pairs to HAR format.
"""
return [{"name": k, "value": v} for k, v in obj.items()]
@staticmethod
def nano_to_ms(time_nano):
return int(time_nano / 1000000)
@staticmethod
def sec_to_nano(time_sec):
return int(time_sec * 1000000000)
addons = [
HarDumpAddOn()
]
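# A minimal sketch of loading this addon standalone (assumed invocation;
# BrowserUp normally wires it into its own embedded mitmproxy):
#
#   mitmdump -s har_dump.py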