-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathparse_aams
More file actions
executable file
·113 lines (85 loc) · 2.66 KB
/
parse_aams
File metadata and controls
executable file
·113 lines (85 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/perl
# vim: shiftwidth=4 tabstop=4
#
# Copyright 2014-2015 by Seeweb s.r.l.
# Written by Marco d'Itri <md@Linux.IT>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
use warnings;
use strict;
use autodie;
use Regexp::Common 2013031201 qw(net URI);
use List::MoreUtils qw(uniq);
use File::Slurp;
# Set this to true if your domain censorship method automatically censors
# all the subdomains of a censored domain.
my $Remove_Subdomains = 1;
##############################################################################
die "Usage: $0 RTF_INPUT DOMAINS_OUTPUT\n"
if @ARGV != 2;
my $lists = parse_aams_file($ARGV[0]);
#die "Censoring URLs is not supported!\n"
# if $lists->{urls} and @{ $lists->{urls} };
#die "Censoring IPs is not supported!\n"
# if $lists->{ips} and @{ $lists->{ips} };
die "No domains found!\n"
if not $lists->{domains} or not @{ $lists->{domains} };
remove_subdomains($lists) if $Remove_Subdomains;
write_file($ARGV[1], { atomic => 1 },
map { "$_\n" } sort @{ $lists->{domains} });
exit 0;
##############################################################################
sub parse_aams_file {
my ($file) = @_;
open(my $fh, '<', $file);
my (@ips, @domains, @urls);
while (<$fh>) {
s/^\s+//;
s/[\r\n\s]+$//;
next if not $_;
# skip header line like 0000350_2025.29.08
next if /^\d+_\d{4}\.\d{2}\.\d{2}$/;
if (/^$RE{net}{IPv4}$/) {
push(@ips, $_);
next;
}
# convert URLs with no path to plain domains
$_ = $1 if m{^https?://([^/]+)/?$};
if (/^$RE{net}{domain}{-nospace}{-rfc1101}$/) {
push(@domains, lc $_);
next;
}
if (/^$RE{URI}{HTTP}{-scheme => 'https?'}$/) {
push(@urls, $_);
next;
}
warn "Invalid entry in $file: '$_'\n";
}
close $fh;
die "$file contains no data!\n" if not (@domains or @ips or @urls);
return {
ips => [ uniq @ips ],
domains => [ uniq @domains ],
urls => [ uniq @urls ],
};
}
##############################################################################
sub remove_subdomains {
my ($l) = @_;
return if not $l or not $l->{domains};
my $list = $l->{domains};
my %domains = map { $_ => undef } @$list;
for (my $i = 0; $i < @$list; $i++) {
my $domain = $list->[$i];
# remove each leading component of the domain and check if the new
# resulting domain is in our list
while ($domain =~ s/^.+?\.(.+\..+)$/$1/) {
delete $list->[$i] if exists $domains{$domain};
}
}
# remove the deleted domains from the list
$l->{domains} = [ grep { defined $_ } @{ $l->{domains} } ];
}