Skip to content

Commit 71d8ac1

Browse files
committed
Merge branch 'main' into ete4
2 parents fcaf0c1 + 64b6cc6 commit 71d8ac1

File tree

12 files changed

+1116
-3036
lines changed

12 files changed

+1116
-3036
lines changed

.github/workflows/mypy.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
strategy:
1414
fail-fast: false
1515
matrix:
16-
python-version: ["3.10", "3.11", "3.12"]
16+
python-version: ["3.10", "3.11", "3.12", "3.13"]
1717

1818
steps:
1919
- uses: actions/checkout@v4
@@ -26,7 +26,7 @@ jobs:
2626
run: |
2727
git submodule update --init --recursive
2828
pipx install poetry
29-
poetry install
29+
poetry install --with dev --all-extras
3030
3131
- name: Run MyPy
3232
run: |

.github/workflows/pytests.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
strategy:
1414
fail-fast: false
1515
matrix:
16-
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
16+
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
1717

1818
steps:
1919
- uses: actions/checkout@v4
@@ -26,12 +26,12 @@ jobs:
2626
run: |
2727
git submodule update --init --recursive
2828
pipx install poetry
29-
poetry install
29+
poetry install --with dev --all-extras
3030
3131
- name: Test with pytest
3232
run: |
3333
poetry run pytest --cov=har2tree tests/test.py
3434
poetry run pytest --cov=har2tree tests/simple_test.py
3535
3636
- name: Upload coverage to Codecov
37-
uses: codecov/codecov-action@v4
37+
uses: codecov/codecov-action@v5

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33
exclude: "tests/data"
44
repos:
55
- repo: https://github.com/pre-commit/pre-commit-hooks
6-
rev: v4.5.0
6+
rev: v5.0.0
77
hooks:
88
- id: trailing-whitespace
99
- id: end-of-file-fixer
1010
- id: check-yaml
1111
- id: check-added-large-files
1212
- repo: https://github.com/asottile/pyupgrade
13-
rev: v3.15.1
13+
rev: v3.19.0
1414
hooks:
1515
- id: pyupgrade
16-
args: [--py38-plus]
16+
args: [--py39-plus]

.readthedocs.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
version: 2
22
build:
3-
os: "ubuntu-22.04"
3+
os: "ubuntu-lts-latest"
44
tools:
55
python: "3"
66

7+
sphinx:
8+
configuration: docs/source/conf.py
9+
710
python:
811
install:
912
- method: pip

har2tree/har2tree.py

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,12 @@ def __init__(self, harfile: Path, capture_uuid: str):
159159
# Used to find the root entry of a page in the capture
160160
# NOTE 2020-05-19: Turns out multiple pages can have the exact same timestamp...
161161
self.pages_start_times: dict[str, list[dict[str, Any]]] = defaultdict(list)
162-
for page in self.har['log']['pages']:
163-
self.pages_start_times[page['startedDateTime']].append(page)
164-
# The first entry has a different start time as the one from the list, add that
165-
if self.entries:
166-
self.pages_start_times[self.initial_start_time].append(self.har['log']['pages'][0])
162+
if 'pages' in self.har['log']:
163+
for page in self.har['log']['pages']:
164+
self.pages_start_times[page['startedDateTime']].append(page)
165+
# The first entry has a different start time as the one from the list, add that
166+
if self.entries:
167+
self.pages_start_times[self.initial_start_time].append(self.har['log']['pages'][0])
167168

168169
# Set to false if initial_redirects fails to find the chain.
169170
self.need_tree_redirects = False
@@ -190,7 +191,7 @@ def _search_final_redirect(self) -> None:
190191
self.final_redirect = self.final_redirect.split('?', 1)[0]
191192
self._search_final_redirect()
192193
else:
193-
self.logger.warning(f'Unable to find the final redirect: {self.final_redirect}')
194+
self.logger.info(f'Unable to find the final redirect: {self.final_redirect}')
194195

195196
@property
196197
def number_entries(self) -> int:
@@ -200,7 +201,7 @@ def number_entries(self) -> int:
200201
@property
201202
def initial_title(self) -> str:
202203
"""Title of the first page in the capture"""
203-
if self.har['log']['pages'][0]['title']:
204+
if 'pages' in self.har['log'] and self.har['log']['pages'][0]['title']:
204205
return self.har['log']['pages'][0]['title']
205206
else:
206207
return '!! No title found !!'
@@ -564,7 +565,8 @@ def rendered_node(self) -> URLNode:
564565
if node := self.url_tree.get_first_by_feature('name', self.har.final_redirect, expect_missing=True):
565566
return node
566567

567-
if self.har.final_redirect:
568+
browser_errors = ['chrome-error', 'about:blank']
569+
if self.har.final_redirect and not any(self.har.final_redirect.startswith(r) for r in browser_errors):
568570
self.logger.warning(f'Final redirect URL from adress bar not in tree: {self.har.final_redirect}')
569571
else:
570572
# No final redirect, already logged earlier.
@@ -639,10 +641,10 @@ def make_tree(self) -> URLNode:
639641
if self.har.downloaded_filename and self.har.downloaded_file:
640642
downloaded_file = (self.har.downloaded_filename if self.har.downloaded_filename else '',
641643
self.har.downloaded_file if self.har.downloaded_file else None)
642-
643-
self.rendered_node.add_rendered_features(list(self.all_url_requests.keys()),
644-
rendered_html=self.har.html_content if self.har.html_content else None,
645-
downloaded_file=downloaded_file)
644+
if not self.rendered_node.empty_response:
645+
self.rendered_node.add_rendered_features(list(self.all_url_requests.keys()),
646+
rendered_html=self.har.html_content if self.har.html_content else None,
647+
downloaded_file=downloaded_file)
646648

647649
# Initialize the hostname tree root
648650
self.hostname_tree.add_url(self.url_tree)
@@ -669,26 +671,28 @@ def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
669671

670672
# Sometimes, the har has a list of pages, generally when we have HTTP redirects.
671673
# IF we have more than one page in the list
672-
# AND the orphan node's pageref points to an other page than the first one <= FIXME not enabled yet
674+
# AND the orphan node's pageref points to another page than the first one
673675
# AND we already have a node in the tree with this pageref
674676
# => attach to that node.
675-
if len(self.har.har['log']['pages']) > 1 and node.pageref != self.har.har['log']['pages'][0] and self.pages_root[node.pageref] != node.uuid:
677+
if ('pages' in self.har.har['log'] and len(self.har.har['log']['pages']) > 1
678+
and node.pageref != self.har.har['log']['pages'][0]
679+
and self.pages_root[node.pageref] != node.uuid):
676680
# In that case, we check if there is already a page with the pageref of the orphan node,
677681
# and attach the node to that. NOTE: we can only do that if there is already a node with this pageref in the tree.
678682
# This node is not a page root, we can attach it \o/
679683
page_root_node = self.get_url_node_by_uuid(self.pages_root[node.pageref])
680684
if dev_debug:
681685
self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to page {node.pageref} - Node: {page_root_node.uuid} - {page_root_node.name}.')
682686
self._make_subtree(page_root_node, [node])
683-
elif final_node := self.url_tree.get_first_by_feature('name', self.har.final_redirect, expect_missing=True):
687+
elif final_redirect := self.url_tree.get_first_by_feature('name', self.har.final_redirect, expect_missing=True):
684688
# Generally, when we have a bunch of redirects, they do not branch out before the final landing page
685689
# *but* it is not always the case: some intermediary redirects will have calls to 3rd party pages.
686690
# Hopefully, this last case was taken care of in the branch above.
687691
# In this branch, we get the landing page after the redirects (if any), and attach the node to it.
688692
if dev_debug:
689693
self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to final redirect: {self.har.final_redirect}.')
690-
self._make_subtree(final_node, [node])
691-
else:
694+
self._make_subtree(final_redirect, [node])
695+
elif 'pages' in self.har.har['log']:
692696
# No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
693697
page_before = self.har.har['log']['pages'][0]
694698
for page in self.har.har['log']['pages'][1:]:
@@ -710,6 +714,9 @@ def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
710714
page_root_node = self.url_tree
711715
self.logger.warning('The pages in the HAR are in in the wrong order, this should not happen but here we are')
712716
self._make_subtree(page_root_node, [node])
717+
else:
718+
# no way to attach it to anything else, attach to the root node
719+
self._make_subtree(self.url_tree, [node])
713720

714721
@trace_make_subtree
715722
def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None, dev_debug: bool=False) -> None:
@@ -723,7 +730,6 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
723730
for unode in nodes_to_attach:
724731
if dev_debug:
725732
self.logger.warning(f'Attaching URLNode {unode.name} to {root.name}.')
726-
727733
unodes.append(root.add_child(unode))
728734

729735
if dev_debug:
@@ -769,8 +775,13 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
769775
else:
770776
self.logger.warning(f'The URLNode has a redirect to something we already processed ({unode.redirect_url}), this should not happen.')
771777

772-
# The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
778+
# 2025-02-06: If a node has no redirect **and** no content (empty response), we don't want to attach anything to it (it is a leaf)
779+
# Example: A POST to self that triggers the **parent** to load another URL. In this case,
780+
# the proper attachment point is the parent, not this node, even if we have other nodes with this node URL as a referer.
781+
if unode.empty_response:
782+
continue
773783

784+
# The node can have a redirect, but also trigger ressources referring to themselves, we need to trigger this code on each node.
774785
if self.all_initiator_url.get(unode.name):
775786
# The URL (unode.name) is in the list of known urls initiating calls
776787
for u in self.all_initiator_url[unode.name]:
@@ -792,16 +803,13 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
792803
for ref in _referer_strings:
793804
if self.all_referer.get(ref):
794805
matching_urls = []
806+
# 2024-11-20: Referers are kinda weak, we can have multiple URLs with the same referer even if one of the nodes should be attached somewhere else.
807+
# Let's attach the nodes one by one there (if they've not been attached recursively)
795808
for u in self.all_referer[ref]:
796-
matching_urls += [url_node for url_node in self.all_url_requests[u]
797-
if url_node in self._nodes_list
798-
and 'referer' in url_node.features
799-
and url_node.referer == ref]
800-
self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
801-
if dev_debug:
802-
self.logger.warning(f'Found via referer from {unode.name} to {matching_urls}.')
803-
# 2022-04-27: build subtrees recursively *after* we find all the best referer matches
804-
self._make_subtree(unode, matching_urls)
809+
for url_node in self.all_url_requests[u]:
810+
if url_node in self._nodes_list and hasattr(url_node, 'referer') and url_node.referer == ref:
811+
self._nodes_list = [node for node in self._nodes_list if node != url_node]
812+
self._make_subtree(unode, [url_node])
805813

806814
if 'external_ressources' in unode.features:
807815
# the url loads external things, and some of them have no referer....
@@ -811,7 +819,13 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
811819
# We have a lot of false positives
812820
# 2021-06-19: or the URL of the final redirect is somewhere in an embedded content. In that case, we don't want to attach to the sub-node.
813821
continue
814-
matching_urls = [url_node for url_node in self.all_url_requests[link] if url_node in self._nodes_list]
822+
# 2024-11-21: We only want to attach via external ressources *if*
823+
# 1. the node has no referer
824+
# 2. the node has a referer, but it is not in the list of potential nodes we can attach them to so they would be dangling.
825+
826+
matching_urls = [url_node for url_node in self.all_url_requests[link]
827+
if url_node in self._nodes_list
828+
and (not hasattr(url_node, 'referer') or url_node.referer not in self.all_url_requests)]
815829
self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
816830
if dev_debug:
817831
self.logger.warning(f'Found from {unode.name} via external ressources ({external_tag}): {matching_urls}.')

0 commit comments

Comments
 (0)