使用 GNU Parallel 处理大型阻止列表

使用 GNU Parallel 处理大型阻止列表

这是一个仍在进行中的项目,最近做了重大更新以加快列表处理速度。感兴趣的读者可以访问项目主页!

它处理 lists.json 中定义的各个列表:先把列表内容转换为原始主机,再把这些主机放入与其操作方法(method)和主机格式(format)相匹配的列表中。build_lists.bash 顶部的常量给出了各个变量的定义。

欢迎提供有关如何加快速度或修复错误的任何建议!另外建议在这个 Docker 环境中运行该项目。只需将此处给出的所有文件放在同一目录中,为所有脚本授予执行权限后运行 ./build_lists.bash 即可。还建议修改 aria2.conf 中的 useragent。

输入:

脚本:

build_lists.bash

#!/usr/bin/env bash

#shopt -s extdebug     # or --debugging
set +H +o history     # disable history features (helps avoid errors from "!" in strings)
shopt -u cmdhist      # would be enabled and have no effect otherwise
shopt -s execfail     # ensure interactive and non-interactive runtime are similar
shopt -s extglob      # enable extended pattern matching (https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html)
set -euET -o pipefail # put bash into strict mode & have it give descriptive errors
# Owner-only permissions for everything this script creates: directories
# become 700 and regular files 600. A mask of 055 would only strip read and
# execute from group/others and leave their write bit set (722/622).
umask 077

# Scratch space: DOWNLOADS is a private temporary directory and TMP is a
# temporary file created inside it. The assignments stay separate from
# `readonly` so a failing mktemp is not masked by the builtin's exit status.
DOWNLOADS=$(mktemp -d)
TMP=$(mktemp -p "$DOWNLOADS")
readonly DOWNLOADS TMP

# List methods: whether the hosts of a list are blocked or allowed.
readonly METHOD_ALLOW='ALLOW'
readonly METHOD_BLOCK='BLOCK'

# Host formats a list can be rendered into.
readonly FORMAT_DOMAIN='DOMAIN'
readonly FORMAT_CIDR4='CIDR4'
readonly FORMAT_CIDR6='CIDR6'
readonly FORMAT_IPV4='IPV4'
readonly FORMAT_IPV6='IPV6'

# Iteration order used by main: BLOCK before ALLOW, and the plain IP formats
# before the CIDR formats.
readonly -a METHODS=("$METHOD_BLOCK" "$METHOD_ALLOW")
readonly -a FORMATS=("$FORMAT_DOMAIN" "$FORMAT_IPV4" "$FORMAT_IPV6" "$FORMAT_CIDR4" "$FORMAT_CIDR6")

# https://github.com/ildar-shaimordanov/perl-utils#sponge
# Soak up all of stdin, then write it to the file named by $1. Because the
# file is only opened after stdin reaches EOF, the same file can safely be
# the input of the pipeline that feeds this function.
# params: output file
sponge() {
    # The three-argument open takes the file name literally; the two-argument
    # form would trim surrounding whitespace and interpret mode characters.
    # print and close are checked so a failed write (e.g. disk full) is
    # reported instead of silently leaving a truncated file.
    perl -ne '
    push @lines, $_;
    END {
        open(my $out, ">", $file)
            or die "sponge: cannot open $file: $!\n";
        print {$out} @lines
            or die "sponge: cannot write $file: $!\n";
        close($out)
            or die "sponge: cannot close $file: $!\n";
    }
    ' -s -- -file="$1"
}

# Sort a list file in place with parsort (flags -bfiu) and report it.
# Temporary sort files go to $DOWNLOADS; the result is written back through
# sponge so the file is not truncated while parsort is still reading it.
# params: list file
sorted() {
    local target="$1"

    parsort -bfiu -S 100% -T "$DOWNLOADS" "$target" |
        sponge "$target"

    echo "[INFO] Optimized: ${target}"
}

# Remove from a blacklist every line that exactly matches a whitelist line.
# params: blacklist, whitelist
apply_whitelist() {
    local target="$1"
    local allowed="$2"

    # https://askubuntu.com/a/562352
    # The blacklist is split into chunks that are filtered by parallel grep
    # jobs (-k keeps the chunk order); each surviving line is appended to the
    # shared temp file as it is produced instead of being held in memory.
    # NOTE(review): grep -v exits 1 when a chunk has no surviving lines, which
    # parallel may report as a failed job under `set -e` - confirm.
    parallel --pipe -k -j+0 grep --line-buffered -Fxvf "$allowed" - <"$target" >>"$TMP"

    # Replace the blacklist with the filtered copy, then empty the temp file
    # so the next call starts from a clean slate.
    cp "$TMP" "$target"
    : >"$TMP"

    echo "[INFO] Applied whitelist to: ${target}"
}

# Drop from an IP list every entry matched by the given CIDR list (grepcidr -v).
# Does nothing, successfully, when the IP list is not a regular file.
# params: ip list, cidr whitelist
apply_cidr_whitelist() {
    local ip_list="$1"
    local cidr_list="$2"

    test -f "$ip_list" || return 0

    # sponge only rewrites the list once grepcidr has finished writing, so
    # reading from and writing to the same file is safe here.
    sem -j+0 grepcidr -vf "$cidr_list" <"$ip_list" | sponge "$ip_list"
    sem --wait

    echo "[INFO] Applied CIDR whitelist to: ${ip_list}"
}

# Prepare the working environment: register the exit handler, create the
# output and log directories, reset the aria2 log and clear the sticky bit
# on /tmp for the duration of the run.
init() {
    # The handler runs on every exit path, so the download directory is
    # removed and the sticky bit on /tmp is put back even when the script
    # aborts under `set -e` before reaching cleanup.
    trap 'rm -rf "$DOWNLOADS"; chmod +t /tmp' EXIT || exit 1
    # logs/ must exist before the log file can be truncated below.
    mkdir -p build/ logs/
    : >logs/aria2.log
    chmod -t /tmp
}

# Undo the change init made to /tmp by setting its sticky bit again.
cleanup() {
    chmod +t /tmp
}

# Build every list under build/.
# For each method (BLOCK first, then ALLOW) the lists selected from
# lists.json are downloaded with aria2c, converted per format by
# apply_filters.bash, merged into build/<method>_<format>.txt, sorted, and
# finally used to filter the BLOCK lists. Empty lists are removed and a
# checksum file is written at the end.
main() {
    local cache
    local list
    local blacklist
    local results

    init

    for method in "${METHODS[@]}"; do
        # Download directory for this method, inside the temporary tree.
        cache="${DOWNLOADS}/${method}"

        echo "[INFO] Processing method: ${method}"

        # jq emits, for every ARIA2 entry of this method, one line with the
        # tab-separated mirrors followed by an indented out=<key> option line;
        # aria2c reads that as its input list from stdin.
        set +e # temporarily disable strict fail, in case downloads fail
        jq -r --arg method "$method" 'to_entries[] |
            select(.value.content.retriever == "ARIA2" and .value.method == $method) |
            {key, mirrors: .value.mirrors} |
            (.mirrors | join("\t")), " out=\(.key)"' lists.json |
            aria2c -i- -d "$cache" --conf-path='./aria2.conf'
        set -e

        echo "[INFO] Downloaded ${method} lists!"

        for format in "${FORMATS[@]}"; do
            # GNU Parallel stores each job's output below this directory.
            results="${cache}/${format}"
            mkdir -p "$results"

            echo "[INFO] Sending list results to: ${results}"

            # Only the downloaded files directly inside the cache directory
            # are processed (-maxdepth 1), not earlier result directories.
            # NOTE(review): -X may place several file names into one command
            # line, while apply_filters.bash reads exactly three arguments -
            # confirm that each job receives a single file.
            find -P -O3 "$cache" -maxdepth 1 -type f -print0 |
                # https://www.gnu.org/software/parallel/parallel_tutorial.html#controlling-the-execution
                parallel -0 --use-cpus-instead-of-cores --jobs 0 --results "$results" -X ./apply_filters.bash {} "$method" "$format"

            list="build/${method}_${format}.txt"

            echo "[INFO] Processed: ${list}"

            # Merge the captured stdout of every job into the final list.
            find -P -O3 "$results" -type f -name stdout -exec cat -s {} + | sponge "$list"

            if [ -f "$list" ] && [ -s "$list" ]; then
                sorted "$list"

                if [[ "$method" == "$METHOD_ALLOW" ]]; then
                    # BLOCK is processed before ALLOW, so the BLOCK list of
                    # this format has already been written at this point.
                    blacklist="build/BLOCK_${format}.txt"
                    echo "[INFO] Applying whitelist: ${list}"

                    # Allowed CIDRs are removed from both the CIDR and the
                    # plain IP blacklists of the same address family; every
                    # other format uses exact line matching.
                    case "$format" in
                    "$FORMAT_CIDR4")
                        apply_cidr_whitelist "$blacklist" "$list"
                        apply_cidr_whitelist "build/BLOCK_IPV4.txt" "$list"
                        ;;
                    "$FORMAT_CIDR6")
                        apply_cidr_whitelist "$blacklist" "$list"
                        apply_cidr_whitelist "build/BLOCK_IPV6.txt" "$list"
                        ;;
                    *)
                        apply_whitelist "$blacklist" "$list"
                        ;;
                    esac
                else
                    # Remove IPs from the IP blacklists that are covered by the CIDR blacklists
                    case "$format" in
                    "$FORMAT_CIDR4")
                        apply_cidr_whitelist "build/BLOCK_IPV4.txt" "$list"
                        ;;
                    "$FORMAT_CIDR6")
                        apply_cidr_whitelist "build/BLOCK_IPV6.txt" "$list"
                        ;;
                    *) ;;
                    esac
                fi

                echo "[INFO] Processed ${method} ${format} list!"
            fi
        done
    done

    # https://superuser.com/questions/191889/how-can-i-list-only-non-empty-files-using-ls
    find -P -O3 ./build/ -size 0 -type f -name "*.txt" -exec rm {} \; # remove any empty lists
    # Record a SHA-256 checksum for every remaining list.
    find -P -O3 ./build/ -type f -name "*.txt" -exec sha256sum {} \; | sponge './build/CHECKSUMS.txt'

    cleanup
}

# https://github.com/koalaman/shellcheck/wiki/SC2218
# Entry point: called last so that every function above is already defined.
main

apply_filters.bash

#!/usr/bin/env bash

# Filter stdin through `ipinfo grepip`, restricted to IPv4 (-4).
# NOTE(review): per the ipinfo CLI help, -h omits file names, -o prints only
# the matched address and -x excludes reserved/bogon addresses - verify
# against the installed version.
get_ipv4s() {
    ipinfo grepip -4hox --nocolor
}

# Filter stdin through `ipinfo grepip`, restricted to IPv6 (-6).
# NOTE(review): per the ipinfo CLI help, -h omits file names, -o prints only
# the matched address and -x excludes reserved/bogon addresses - verify
# against the installed version.
get_ipv6s() {
    ipinfo grepip -6hox --nocolor
}

# Print the host part of every http, https or udp URL found on stdin,
# keeping only hosts that is_domain accepts (with .onion allowed as a
# private TLD). Anything perl writes to stderr is discarded.
get_domains_from_urls() {
    perl -MData::Validate::Domain=is_domain -MRegexp::Common=URI -nE '
        while (/$RE{URI}{HTTP}{-scheme => "https?|udp"}{-keep}/g) {
            say $3 if is_domain($3, { domain_private_tld => { onion => 1 } });
        }
    ' 2>/dev/null
}

# Print the host part of every http, https or udp URL found on stdin,
# keeping only hosts that is_ipv4 accepts. Anything perl writes to stderr
# is discarded.
get_ipv4s_from_urls() {
    perl -MData::Validate::IP=is_ipv4 -MRegexp::Common=URI -nE '
        while (/$RE{URI}{HTTP}{-scheme => "https?|udp"}{-keep}/g) {
            say $3 if is_ipv4($3);
        }
    ' 2>/dev/null
}

# Extract the host from filter rules of the form ||host^ (optionally followed
# by $third-party) read on stdin and print it in lower case. Fields are split
# on "|" and "^", so the host is the third field.
hostsblock() {
    gawk '
        BEGIN { FS = "[|^]" }
        /^\|\|([[:alnum:]_-]{1,63}\.)+[[:alpha:]]+\^(\$third-party)?$/ {
            print tolower($3)
        }
    '
}

# Print one column of headerless CSV read from stdin, with comment lines
# skipped and whitespace cleaned by Miller.
# params: column number
mlr_cut_col() {
    local column="$1"

    mlr --csv --skip-comments -N \
        clean-whitespace then \
        cut -f "$column"
}

# Convert one downloaded list into validated hosts on stdout.
# The function is a single three-stage pipeline:
#   1. unpack FILE_PATH according to CONTENT_FILTER,
#   2. extract candidate hosts according to CONTENT_TYPE and LIST_FILTER,
#      then drop empty lines and repeated lines,
#   3. validate the candidates according to LIST_FORMAT (and, for plain IPs,
#      LIST_METHOD).
# A value that matches no case arm makes that stage emit nothing.
# params: file path, list method, content filter, content type, list filter, list format
process_list() {
    local FILE_PATH
    local LIST_METHOD
    local CONTENT_FILTER
    local CONTENT_TYPE
    local LIST_FILTER
    local LIST_FORMAT

    FILE_PATH="$1"
    LIST_METHOD="$2"
    CONTENT_FILTER="$3"
    CONTENT_TYPE="$4"
    LIST_FILTER="$5"
    LIST_FORMAT="$6"

    # Stage 1: write the (decompressed) list content to stdout.
    case "$CONTENT_FILTER" in
    'NONE') cat -s "$FILE_PATH" ;;
    '7Z') 7za -y -so e "$FILE_PATH" ;;
    # NOTE(review): zcat is a gzip tool; presumably these archives hold a
    # single member that it can read - confirm.
    'ZIP') zcat "$FILE_PATH" ;;
    'GZIP') gzip -cd "$FILE_PATH" ;;
    'TARBALL') tar -xOzf "$FILE_PATH" ;;
    'SQUIDGUARD') tar -xOzf "$FILE_PATH" --wildcards-match-slash --wildcards '*/domains' ;;
    'SCAFROGLIA') unzip -p "$FILE_PATH" blocklists-master/*.txt ;;
    'SHADOWWHISPERER') unzip -p "$FILE_PATH" BlockLists-master/RAW/* ;;
    'ESOX_LUCIUS') unzip -p "$FILE_PATH" PiHoleblocklists-main/* -x PiHoleblocklists-main/LICENSE PiHoleblocklists-main/README.md ;;
    esac |
        case "$CONTENT_TYPE" in
        # Stage 2: pick the extractor that matches the content type and filter.
        'TEXT')
            case "$LIST_FILTER" in
            'NONE') cat -s ;;
            'RAW_HOSTS_WITH_COMMENTS') mawk '/^[^[:space:]|^#|^!|^;|^$|^:]/{print $1}' ;;
            'HOSTS_FILE') ghosts -m /dev/stdin -o -p -noheader -stats=false ;;
            'ABUSE_CH_URLHAUS_DOMAIN') get_domains_from_urls ;;
            'ABUSE_CH_URLHAUS_IPV4') get_ipv4s_from_urls ;;
            'ALIENVAULT') mawk -F# '{print $1}' ;;
            'ADBLOCK') hostsblock ;;
            'GREP_IPV4') get_ipv4s ;;
            'GREP_IPV6') get_ipv6s ;;
            'BOTVIRJ_IPV4') mawk -F'|' '{print $1}' ;;
            'CRYPTOLAEMUS_DOMAIN') hxextract code /dev/stdin | head -n -1 | tail -n +6 ;;
            'CRYPTOLAEMUS_IPV4') hxextract code /dev/stdin | head -n -1 | tail -n +6 | get_ipv4s ;;
            'CYBERCRIME_DOMAIN') mawk -F/ '{print $1}' ;;
            'CYBERCRIME_IPV4') mawk -F/ '{split($1,a,":");print a[1]}' | get_ipv4s ;;
            'DATAPLANE_IPV4') mawk -F'|' '$0~/^[^#]/{gsub(/ /,""); print $3}' ;;
            'DSHIELD') mlr --tsv --skip-comments -N put '$cidr = $1 . "/" . $3' then cut -f cidr ;;
            'MYIP_DOMAIN') mawk -F, '$0~/^[^#]/{print $2}' ;;
            'MYIP_IPV4') mawk '$0~/^[^#]/{print $1}' | get_ipv4s ;;
            'MYIP_IPV6') mawk '$0~/^[^#]/{print $1}' | get_ipv6s ;;
            # NOTE(review): /^[http]/ is a bracket expression, so it keeps
            # lines starting with h, t or p rather than the literal prefix
            # http; the URL extractor that follows still filters the result.
            'VXVAULT_DOMAIN') mawk '/^[http]/' | get_domains_from_urls ;;
            'VXVAULT_IPV4') mawk '/^[http]/' | get_ipv4s_from_urls ;;
            'XFILES') tr -d "[:blank:]" | hostsblock | mawk '{print $2}' ;;
            'TRACKERSLIST') mawk '{print $1}' | get_domains_from_urls ;;
            'CHARLES_B_HALEY') mawk '$0~/^[^#]/{print $3}' ;;
            'QUANTUMULTX') mawk -F, '$1~/^HOST-SUFFIX$/{print $2}' ;;
            'QUINDECIM') mawk -F= '$0~/^=/{print $2}' | mawk '{print $1}' ;;
            'ZEEK_DOMAIN') mawk '/^[^[:space:]|^#]/&&$2~/^Intel::DOMAIN$/{print $1}' ;;
            'ZEEK_IPV4') mawk '/^[^[:space:]|^#]/&&$2~/^Intel::ADDR$/{print $1}' ;;
            esac
            ;;
        'JSON')
            case "$LIST_FILTER" in
            'ABUSE_CH_FEODOTRACKER_IPV4') jq -r '.[].ip_address' ;;
            'ABUSE_CH_FEODOTRACKER_DOMAIN') jq -r '.[] | select(.hostname != null) | .hostname' ;;
            'ABUSE_CH_THREATFOX_IPV4') jq -r 'to_entries[].value[].ioc_value | split(":")[0]' ;;
            'ABUSE_CH_THREATFOX_DOMAIN') jq -r 'to_entries[].value[].ioc_value' ;;
            'AYASHIGE') jq -r '.[].fqdn' ;;
            'CYBER_CURE_IPV4') jq -r '.data.ip[]' ;;
            'CYBERSAIYAN_DOMAIN') jq -r '.[] | select(.value.type == "URL") | .indicator' | get_domains_from_urls ;;
            'CYBERSAIYAN_IPV4') jq -r '.[] | select(.value.type == "URL") | .indicator' | get_ipv4s_from_urls ;;
            'DISCONNECTME_ENTITIES') jq -r '.entities[] | "\(.properties[])\n\(.resources[])"' ;;
            'DISCONNECTME_SERVICES') jq -r '.categories[] | to_entries[].value[] | to_entries[].value[]' ;;
            'HIPO_UNIVERSITIES') jq -r '.[].domains | join("\n")' ;;
            'ISCSANS') jq -r '.[].ipv4' ;;
            'MALSILO_DOMAIN') jq -r '.data[].network_traffic | select(.dns != null) | .dns[]' ;;
            'MALSILO_IPV4') jq -r '.data[].network_traffic | select(.tcp != null) | .tcp[] | split(":")[0]' ;;
            'MALTRAIL') jq -r '.[].ip' ;;
            'TINYCHECK_DOMAIN') jq -r '.iocs[] | select(.type == "domain") | .value' ;;
            'TINYCHECK_FREEDNS') jq -r '.iocs[] | select(.type == "freedns") | .value' ;;
            'TINYCHECK_IPV4') jq -r '.iocs[] | select(.type == "ip4addr") | .value' ;;
            'TINYCHECK_CIDR') jq -r '.iocs[] | select(.type == "cidr") | .value' ;;
            'CHONG_LUA_DAO_DOMAIN') jq -r '.[].url' | get_domains_from_urls ;;
            'CHONG_LUA_DAO_IPV4') jq -r '.[].url' | get_ipv4s_from_urls ;;
            'INQUEST_DOMAIN') jq -r '.data[] | select(.artifact_type == "domain") | .artifact' ;;
            'INQUEST_IPV4') jq -r '.data[] | select(.artifact_type == "ipaddress") | .artifact' ;;
            'CERTEGO') jq -rs '.[].links[].url' | mawk -F/ '$5~/^domain$/{print $6}' ;;
            'SECUREDROP') jq -r '.[] | .onion_address as $onion | .organization_url | split("/")[2] as $org | $org, $onion' ;;
            esac
            ;;
        'CSV')
            case "$LIST_FILTER" in
            'MLR_CUT_1') mlr_cut_col 1 ;;
            'MLR_CUT_2') mlr_cut_col 2 ;;
            'MLR_CUT_3') mlr_cut_col 3 ;;
            'MLR_CUT_4') mlr_cut_col 4 ;;
            'BENKOW_DOMAIN') mlr --csv --headerless-csv-output --ifs ';' cut -f url | get_domains_from_urls ;;
            'BENKOW_IPV4') mlr --csv --headerless-csv-output --ifs ';' cut -f url | get_ipv4s_from_urls ;;
            'BOTVIRJ_COVID') mawk 'NR>1' ;;
            'CYBER_CURE_DOMAIN_URL') tr ',' '\n' | get_domains_from_urls ;;
            'MALWARE_DISCOVERER_DOMAIN') mlr --csv --headerless-csv-output cut -f domain ;;
            'MALWARE_DISCOVERER_IPV4') mlr --csv --headerless-csv-output cut -f ip ;;
            'PHISHSTATS_DOMAIN') mlr_cut_col 3 | get_domains_from_urls ;;
            'PHISHSTATS_IPV4') mlr_cut_col 4 | get_ipv4s ;;
            'PHISHSTATS_IPV6') mlr_cut_col 4 | get_ipv6s ;;
            'TURRIS') mlr --csv --headerless-csv-output --skip-comments cut -f Address ;;
            'VIRIBACK_DOMAIN') mlr --csv --headerless-csv-output cut -f URL | get_domains_from_urls ;;
            'VIRIBACK_IPV4') mlr --csv --headerless-csv-output cut -f IP ;;
            'SHADOWSERVER_HOST') mlr --csv --headerless-csv-output cut -f http_host ;;
            'SHADOWSERVER_TARGET') mlr --csv --headerless-csv-output cut -f redirect_target ;;
            'WATCHLIST_INTERNET') mlr --csv --ifs ';' -N cut -f 1 ;;
            'CRUZ_IT') mlr --csv --headerless-csv-output clean-whitespace then cut -f ip_address ;;
            'PHISHTANK') mlr --csv --headerless-csv-output cut -f url | get_domains_from_urls ;;
            'BLOCKLIST_UA') mlr --csv --ifs ';' --headerless-csv-output cut -f IP ;;
            esac
            ;;
        'YAML')
            case "$LIST_FILTER" in
            'CRYPTOSCAMDB_BLACKLIST') yq '.[].name' ;;
            'CRYPTOSCAMDB_WHITELIST') yq '.[].url' | get_domains_from_urls ;;
            esac
            ;;
        esac | mawk 'NF && !seen[$0]++' |
        case "$LIST_FORMAT" in
        # Stage 3: keep only the candidates that are valid for the format.
        'DOMAIN')
            perl ./process_domains.pl 2>/dev/null
            ;;
        # https://metacpan.org/pod/Data::Validate::IP
        'IPV4')
            case "$LIST_METHOD" in
            # Blocked addresses must pass is_public_ipv4.
            'BLOCK')
                perl -MData::Validate::IP=is_public_ipv4 -nE 'chomp; if(defined($_) && is_public_ipv4($_)) {say $_;}'
                ;;
            # Ensure bogons get whitelisted
            'ALLOW')
                perl -MData::Validate::IP=is_ipv4 -nE 'chomp; if(defined($_) && is_ipv4($_)) {say $_;}'
                ;;
            esac
            ;;
        'IPV6')
            case "$LIST_METHOD" in
            # Blocked addresses must pass is_public_ipv6.
            'BLOCK')
                perl -MData::Validate::IP=is_public_ipv6 -nE 'chomp; if(defined($_) && is_public_ipv6($_)) {say $_;}'
                ;;
            # Ensure bogons get whitelisted
            'ALLOW')
                perl -MData::Validate::IP=is_ipv6 -nE 'chomp; if(defined($_) && is_ipv6($_)) {say $_;}'
                ;;
            esac
            ;;
        # Both CIDR formats are validated by the same script.
        'CIDR4')
            perl ./process_cidrs.pl 2>/dev/null
            ;;
        'CIDR6')
            perl ./process_cidrs.pl 2>/dev/null
            ;;
        esac
}

# Look up the downloaded file in lists.json by its base name and run
# process_list once for every configured entry of the requested format.
# jq joins the three filter settings with "#", which read splits again.
# params: file path, list method, list format
main() {
    local list_path="$1"
    local list_method="$2"
    local list_format="$3"
    local list_key

    list_key="$(basename "$list_path")"

    jq -r --arg key "$list_key" --arg format "$list_format" 'to_entries[] |
        select(.key == $key) | .value |
        .content.filter as $content_filter |
        .content.type as $content_type |
        .formats[] |
        select(.format == $format) |
        "\($content_filter)#\($content_type)#\(.filter)"' lists.json |
        while IFS='#' read -r content_filter content_type list_filter; do
            process_list "$list_path" "$list_method" "$content_filter" "$content_type" "$list_filter" "$list_format"
        done
}

# Entry point. Arguments: $1 = path of the downloaded list file,
# $2 = list method, $3 = target format.
main "$1" "$2" "$3"

process_domains.pl

#!/usr/bin/env perl

use warnings;
use strict;
use open ':std', ':encoding(UTF-8)';
use feature 'say';
use Try::Tiny;
use Text::Trim 'trim';
use Net::IDN::Encode 'domain_to_ascii';
use Data::Validate::Domain 'is_domain';

# Read candidate host names line by line, convert each one to its ASCII
# (IDNA) form and print it when it is a valid domain; .onion is accepted as
# a private TLD. Any exception raised while handling a line is swallowed by
# the try block, so that line is skipped.
while (my $line = <>) {
  chomp $line;

  try {
    my $ascii = domain_to_ascii(trim($line));
    my $is_valid = defined($ascii)
      && is_domain($ascii, { domain_private_tld => { onion => 1 } });

    say $ascii if $is_valid;
  }
}

process_cidrs.pl

#!/usr/bin/env perl

use warnings;
use strict;
use open ':std', ':encoding(UTF-8)';
use feature 'say';
use Try::Tiny;
use Text::Trim 'trim';
use Net::CIDR 'cidrvalidate';

# Read candidate CIDR blocks line by line and print each one that
# cidrvalidate accepts, in the form cidrvalidate returns it. Invalid lines
# are skipped, and any exception raised while handling a line is swallowed
# by the try block.
while (my $line = <>) {
  chomp $line;

  try {
    # https://metacpan.org/pod/Net::CIDR#$ip=Net::CIDR::cidrvalidate($ip);
    my $cidr = cidrvalidate(trim($line));

    # Skip only this line when it is invalid. A loop-control statement such
    # as `last` inside the try block would leave the enclosing while loop and
    # silently drop every remaining line of input.
    say $cidr if defined $cidr;
  };
}

相关内容