@@ -159,11 +159,12 @@ def __init__(self, harfile: Path, capture_uuid: str):
159
159
# Used to find the root entry of a page in the capture
160
160
# NOTE 2020-05-19: Turns out multiple pages can have the exact same timestamp...
161
161
self .pages_start_times : dict [str , list [dict [str , Any ]]] = defaultdict (list )
162
- for page in self .har ['log' ]['pages' ]:
163
- self .pages_start_times [page ['startedDateTime' ]].append (page )
164
- # The first entry has a different start time as the one from the list, add that
165
- if self .entries :
166
- self .pages_start_times [self .initial_start_time ].append (self .har ['log' ]['pages' ][0 ])
162
+ if 'pages' in self .har ['log' ]:
163
+ for page in self .har ['log' ]['pages' ]:
164
+ self .pages_start_times [page ['startedDateTime' ]].append (page )
165
+ # The first entry has a different start time than the one from the list, add that
166
+ if self .entries :
167
+ self .pages_start_times [self .initial_start_time ].append (self .har ['log' ]['pages' ][0 ])
167
168
168
169
# Set to false if initial_redirects fails to find the chain.
169
170
self .need_tree_redirects = False
@@ -190,7 +191,7 @@ def _search_final_redirect(self) -> None:
190
191
self .final_redirect = self .final_redirect .split ('?' , 1 )[0 ]
191
192
self ._search_final_redirect ()
192
193
else :
193
- self .logger .warning (f'Unable to find the final redirect: { self .final_redirect } ' )
194
+ self .logger .info (f'Unable to find the final redirect: { self .final_redirect } ' )
194
195
195
196
@property
196
197
def number_entries (self ) -> int :
@@ -200,7 +201,7 @@ def number_entries(self) -> int:
200
201
@property
201
202
def initial_title (self ) -> str :
202
203
"""Title of the first page in the capture"""
203
- if self .har ['log' ]['pages' ][0 ]['title' ]:
204
+ if 'pages' in self . har [ 'log' ] and self .har ['log' ]['pages' ][0 ]['title' ]:
204
205
return self .har ['log' ]['pages' ][0 ]['title' ]
205
206
else :
206
207
return '!! No title found !!'
@@ -564,7 +565,8 @@ def rendered_node(self) -> URLNode:
564
565
if node := self .url_tree .get_first_by_feature ('name' , self .har .final_redirect , expect_missing = True ):
565
566
return node
566
567
567
- if self .har .final_redirect :
568
+ browser_errors = ['chrome-error' , 'about:blank' ]
569
+ if self .har .final_redirect and not any (self .har .final_redirect .startswith (r ) for r in browser_errors ):
568
570
self .logger .warning (f'Final redirect URL from adress bar not in tree: { self .har .final_redirect } ' )
569
571
else :
570
572
# No final redirect, already logged earlier.
@@ -639,10 +641,10 @@ def make_tree(self) -> URLNode:
639
641
if self .har .downloaded_filename and self .har .downloaded_file :
640
642
downloaded_file = (self .har .downloaded_filename if self .har .downloaded_filename else '' ,
641
643
self .har .downloaded_file if self .har .downloaded_file else None )
642
-
643
- self .rendered_node .add_rendered_features (list (self .all_url_requests .keys ()),
644
- rendered_html = self .har .html_content if self .har .html_content else None ,
645
- downloaded_file = downloaded_file )
644
+ if not self . rendered_node . empty_response :
645
+ self .rendered_node .add_rendered_features (list (self .all_url_requests .keys ()),
646
+ rendered_html = self .har .html_content if self .har .html_content else None ,
647
+ downloaded_file = downloaded_file )
646
648
647
649
# Initialize the hostname tree root
648
650
self .hostname_tree .add_url (self .url_tree )
@@ -669,26 +671,28 @@ def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
669
671
670
672
# Sometimes, the har has a list of pages, generally when we have HTTP redirects.
671
673
# IF we have more than one page in the list
672
- # AND the orphan node's pageref points to an other page than the first one <= FIXME not enabled yet
674
+ # AND the orphan node's pageref points to a page other than the first one
673
675
# AND we already have a node in the tree with this pageref
674
676
# => attach to that node.
675
- if len (self .har .har ['log' ]['pages' ]) > 1 and node .pageref != self .har .har ['log' ]['pages' ][0 ] and self .pages_root [node .pageref ] != node .uuid :
677
+ if ('pages' in self .har .har ['log' ] and len (self .har .har ['log' ]['pages' ]) > 1
678
+ and node .pageref != self .har .har ['log' ]['pages' ][0 ]
679
+ and self .pages_root [node .pageref ] != node .uuid ):
676
680
# In that case, we check if there is already a page with the pageref of the orphan node,
677
681
# and attach the node to that. NOTE: we can only do that if there is already a node with this pageref in the tree.
678
682
# This node is not a page root, we can attach it \o/
679
683
page_root_node = self .get_url_node_by_uuid (self .pages_root [node .pageref ])
680
684
if dev_debug :
681
685
self .logger .warning (f'Failed to attach URLNode in the normal process, attaching node to page { node .pageref } - Node: { page_root_node .uuid } - { page_root_node .name } .' )
682
686
self ._make_subtree (page_root_node , [node ])
683
- elif final_node := self .url_tree .get_first_by_feature ('name' , self .har .final_redirect , expect_missing = True ):
687
+ elif final_redirect := self .url_tree .get_first_by_feature ('name' , self .har .final_redirect , expect_missing = True ):
684
688
# Generally, when we have a bunch of redirects, they do not branch out before the final landing page
685
689
# *but* it is not always the case: some intermediary redirects will have calls to 3rd party pages.
686
690
# Hopefully, this last case was taken care of in the branch above.
687
691
# In this branch, we get the landing page after the redirects (if any), and attach the node to it.
688
692
if dev_debug :
689
693
self .logger .warning (f'Failed to attach URLNode in the normal process, attaching node to final redirect: { self .har .final_redirect } .' )
690
- self ._make_subtree (final_node , [node ])
691
- else :
694
+ self ._make_subtree (final_redirect , [node ])
695
+ elif 'pages' in self . har . har [ 'log' ] :
692
696
# No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
693
697
page_before = self .har .har ['log' ]['pages' ][0 ]
694
698
for page in self .har .har ['log' ]['pages' ][1 :]:
@@ -710,6 +714,9 @@ def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
710
714
page_root_node = self .url_tree
711
715
self .logger .warning ('The pages in the HAR are in in the wrong order, this should not happen but here we are' )
712
716
self ._make_subtree (page_root_node , [node ])
717
+ else :
718
+ # no way to attach it to anything else, attach to the root node
719
+ self ._make_subtree (self .url_tree , [node ])
713
720
714
721
@trace_make_subtree
715
722
def _make_subtree (self , root : URLNode , nodes_to_attach : list [URLNode ] | None = None , dev_debug : bool = False ) -> None :
@@ -723,7 +730,6 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
723
730
for unode in nodes_to_attach :
724
731
if dev_debug :
725
732
self .logger .warning (f'Attaching URLNode { unode .name } to { root .name } .' )
726
-
727
733
unodes .append (root .add_child (unode ))
728
734
729
735
if dev_debug :
@@ -769,8 +775,13 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
769
775
else :
770
776
self .logger .warning (f'The URLNode has a redirect to something we already processed ({ unode .redirect_url } ), this should not happen.' )
771
777
772
- # The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
778
+ # 2025-02-06: If a node has no redirect **and** no content (empty response), we don't want to attach anything to it (it is a leaf)
779
+ # Example: A POST to self that triggers the **parent** to load another URL. In this case,
780
+ # the proper attachment point is the parent, not this node, even if we have other nodes with this node URL as a referer.
781
+ if unode .empty_response :
782
+ continue
773
783
784
+ # The node can have a redirect, but also trigger resources referring to themselves, we need to trigger this code on each node.
774
785
if self .all_initiator_url .get (unode .name ):
775
786
# The URL (unode.name) is in the list of known urls initiating calls
776
787
for u in self .all_initiator_url [unode .name ]:
@@ -792,16 +803,13 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
792
803
for ref in _referer_strings :
793
804
if self .all_referer .get (ref ):
794
805
matching_urls = []
806
+ # 2024-11-20: Referers are kinda weak, we can have multiple URLs with the same referer even though one of the nodes should be attached somewhere else.
807
+ # Let's attach the nodes one by one there (if they've not been attached recursively)
795
808
for u in self .all_referer [ref ]:
796
- matching_urls += [url_node for url_node in self .all_url_requests [u ]
797
- if url_node in self ._nodes_list
798
- and 'referer' in url_node .features
799
- and url_node .referer == ref ]
800
- self ._nodes_list = [node for node in self ._nodes_list if node not in matching_urls ]
801
- if dev_debug :
802
- self .logger .warning (f'Found via referer from { unode .name } to { matching_urls } .' )
803
- # 2022-04-27: build subtrees recursively *after* we find all the best referer matches
804
- self ._make_subtree (unode , matching_urls )
809
+ for url_node in self .all_url_requests [u ]:
810
+ if url_node in self ._nodes_list and hasattr (url_node , 'referer' ) and url_node .referer == ref :
811
+ self ._nodes_list = [node for node in self ._nodes_list if node != url_node ]
812
+ self ._make_subtree (unode , [url_node ])
805
813
806
814
if 'external_ressources' in unode .features :
807
815
# the url loads external things, and some of them have no referer....
@@ -811,7 +819,13 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
811
819
# We have a lot of false positives
812
820
# 2021-06-19: or the URL of the final redirect is somewhere in an embeded content. In that case, we don't want to attach to the sub-node.
813
821
continue
814
- matching_urls = [url_node for url_node in self .all_url_requests [link ] if url_node in self ._nodes_list ]
822
+ # 2024-11-21: We only want to attach via external ressources *if*
823
+ # 1. the node has no referer
824
+ # 2. the node has a referer, but it is not in the list of potential nodes we can attach them to, so they would be dangling.
825
+
826
+ matching_urls = [url_node for url_node in self .all_url_requests [link ]
827
+ if url_node in self ._nodes_list
828
+ and (not hasattr (url_node , 'referer' ) or url_node .referer not in self .all_url_requests )]
815
829
self ._nodes_list = [node for node in self ._nodes_list if node not in matching_urls ]
816
830
if dev_debug :
817
831
self .logger .warning (f'Found from { unode .name } via external ressources ({ external_tag } ): { matching_urls } .' )
0 commit comments