Don't use intermediate strings to filter false positives in gravity (#5176)

This commit is contained in:
yubiuser 2023-02-16 19:02:43 +01:00 committed by GitHub
commit 2a61a03bdf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -524,7 +524,7 @@ num_total_imported_domains=0
num_domains=0 num_domains=0
num_non_domains=0 num_non_domains=0
parseList() { parseList() {
local adlistID="${1}" src="${2}" target="${3}" non_domains sample_non_domains tmp_non_domains_str false_positive local adlistID="${1}" src="${2}" target="${3}" non_domains sample_non_domains
# This sed does the following things: # This sed does the following things:
# 1. Remove all lines containing no domains # 1. Remove all lines containing no domains
# 2. Remove all domains containing invalid characters. Valid are: a-z, A-Z, 0-9, dot (.), minus (-), underscore (_) # 2. Remove all domains containing invalid characters. Valid are: a-z, A-Z, 0-9, dot (.), minus (-), underscore (_)
@@ -542,30 +542,13 @@ parseList() {
# A list of items of common local hostnames not to report as unusable # A list of items of common local hostnames not to report as unusable
# Some lists (e.g. StevenBlack's) contain these as they are supposed to be used as HOST files # Some lists (e.g. StevenBlack's) contain these as they are supposed to be used as HOST files
# but flagging them as unusable causes more confusion than it's worth - so we suppress them from the output # but flagging them as unusable causes more confusion than it's worth - so we suppress them from the output
false_positives=( false_positives="localhost|localhost.localdomain|local|broadcasthost|localhost|ip6-localhost|ip6-loopback|lo0 localhost|ip6-localnet|ip6-mcastprefix|ip6-allnodes|ip6-allrouters|ip6-allhosts"
"localhost"
"localhost.localdomain"
"local"
"broadcasthost"
"localhost"
"ip6-localhost"
"ip6-loopback"
"lo0 localhost"
"ip6-localnet"
"ip6-mcastprefix"
"ip6-allnodes"
"ip6-allrouters"
"ip6-allhosts"
)
# Read the unusable lines into a string # if there are any non-domains, filter the array for false-positives
tmp_non_domains_str=" ${non_domains[*]} " # Credit: https://stackoverflow.com/a/40264051
for false_positive in "${false_positives[@]}"; do if [[ "${#non_domains[@]}" -gt 0 ]]; then
# Remove false positives from tmp_non_domains_str mapfile -d $'\0' -t non_domains < <(printf '%s\0' "${non_domains[@]}" | grep -Ezv "^${false_positives}")
tmp_non_domains_str="${tmp_non_domains_str/ ${false_positive} / }" fi
done
# Read the string back into an array
IFS=" " read -r -a non_domains <<< "${tmp_non_domains_str}"
# Get a sample of non-domain entries, limited to 5 (the list should already have been de-duplicated) # Get a sample of non-domain entries, limited to 5 (the list should already have been de-duplicated)
IFS=" " read -r -a sample_non_domains <<< "$(tr ' ' '\n' <<< "${non_domains[@]}" | head -n 5 | tr '\n' ' ')" IFS=" " read -r -a sample_non_domains <<< "$(tr ' ' '\n' <<< "${non_domains[@]}" | head -n 5 | tr '\n' ' ')"