Reduce number of false positives when gravity is unable to import domains - change the wording of the output (#5128)

This commit is contained in:
Adam Warner 2023-01-22 11:06:45 +00:00 committed by GitHub
commit 81a31b9e7b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -244,7 +244,7 @@ database_adlist_number() {
return; return;
fi fi
output=$( { printf ".timeout 30000\\nUPDATE adlist SET number = %i, invalid_domains = %i WHERE id = %i;\\n" "${num_source_lines}" "${num_invalid}" "${1}" | pihole-FTL sqlite3 "${gravityDBfile}"; } 2>&1 ) output=$( { printf ".timeout 30000\\nUPDATE adlist SET number = %i, invalid_domains = %i WHERE id = %i;\\n" "${num_domains}" "${num_non_domains}" "${1}" | pihole-FTL sqlite3 "${gravityDBfile}"; } 2>&1 )
status="$?" status="$?"
if [[ "${status}" -ne 0 ]]; then if [[ "${status}" -ne 0 ]]; then
@ -519,12 +519,12 @@ gravity_DownloadBlocklists() {
gravity_Blackbody=true gravity_Blackbody=true
} }
# num_target_lines does increase for every correctly added domain in pareseList() # num_total_imported_domains increases for each list processed
num_target_lines=0 num_total_imported_domains=0
num_source_lines=0 num_domains=0
num_invalid=0 num_non_domains=0
parseList() { parseList() {
local adlistID="${1}" src="${2}" target="${3}" incorrect_lines sample_incorrect_lines local adlistID="${1}" src="${2}" target="${3}" non_domains sample_non_domains tmp_non_domains_str false_positive
# This sed does the following things: # This sed does the following things:
# 1. Remove all lines containing no domains # 1. Remove all lines containing no domains
# 2. Remove all domains containing invalid characters. Valid are: a-z, A-Z, 0-9, dot (.), minus (-), underscore (_) # 2. Remove all domains containing invalid characters. Valid are: a-z, A-Z, 0-9, dot (.), minus (-), underscore (_)
@ -534,36 +534,65 @@ parseList() {
sed -r "/([^\.]+\.)+[^\.]{2,}/!d;/[^a-zA-Z0-9.\_-]/d;s/\.$//;s/$/,${adlistID}/;/.$/a\\" "${src}" >> "${target}" sed -r "/([^\.]+\.)+[^\.]{2,}/!d;/[^a-zA-Z0-9.\_-]/d;s/\.$//;s/$/,${adlistID}/;/.$/a\\" "${src}" >> "${target}"
# Find lines containing no domains or with invalid characters (see above) # Find lines containing no domains or with invalid characters (see above)
# Remove duplicates and limit to 5 domains # Remove duplicates from the list
mapfile -t incorrect_lines <<< "$(sed -r "/([^\.]+\.)+[^\.]{2,}/d" < "${src}")" mapfile -t non_domains <<< "$(sed -r "/([^\.]+\.)+[^\.]{2,}/d" < "${src}")"
mapfile -t -O "${#incorrect_lines[@]}" incorrect_lines <<< "$(sed -r "/[^a-zA-Z0-9.\_-]/!d" < "${src}")" mapfile -t -O "${#non_domains[@]}" non_domains <<< "$(sed -r "/[^a-zA-Z0-9.\_-]/!d" < "${src}")"
IFS=" " read -r -a sample_incorrect_lines <<< "$(tr ' ' '\n' <<< "${incorrect_lines[@]}" | sort -u | head -n 5| tr '\n' ' ')" IFS=" " read -r -a non_domains <<< "$(tr ' ' '\n' <<< "${non_domains[@]}" | sort -u | tr '\n' ' ')"
local num_target_lines_new num_correct_lines # A list of items of common local hostnames not to report as unusable
# Get number of lines in source file # Some lists (e.g. StevenBlack's) contain these as they are supposed to be used as HOST files
num_source_lines="$(grep -c "^" "${src}")" # but flagging them as unusable causes more confusion than it's worth - so we suppress them from the output
# Get the new number of lines in destination file false_positives=(
num_target_lines_new="$(grep -c "^" "${target}")" "localhost"
# Number of new correctly added lines "localhost.localdomain"
num_correct_lines="$(( num_target_lines_new-num_target_lines ))" "local"
# Update number of lines in target file "broadcasthost"
num_target_lines="$num_target_lines_new" "localhost"
num_invalid="$(( num_source_lines-num_correct_lines ))" "ip6-localhost"
if [[ "${num_invalid}" -eq 0 ]]; then "ip6-loopback"
echo " ${INFO} Analyzed ${num_source_lines} domains" "lo0 localhost"
else "ip6-localnet"
echo " ${INFO} Analyzed ${num_source_lines} domains, ${num_invalid} domains invalid!" "ip6-mcastprefix"
fi "ip6-allnodes"
"ip6-allrouters"
"ip6-allhosts"
)
# Display sample of invalid lines if we found some # Read the unusable lines into a string
if [ ${#sample_incorrect_lines[@]} -ne 0 ]; then tmp_non_domains_str=" ${non_domains[*]} "
echo " Sample of invalid domains:" for false_positive in "${false_positives[@]}"; do
for each in "${sample_incorrect_lines[@]}" # Remove false positives from tmp_non_domains_str
tmp_non_domains_str="${tmp_non_domains_str/ ${false_positive} / }"
done
# Read the string back into an array
IFS=" " read -r -a non_domains <<< "${tmp_non_domains_str}"
# Get a sample of non-domain entries, limited to 5 (the list should already have been de-duplicated)
IFS=" " read -r -a sample_non_domains <<< "$(tr ' ' '\n' <<< "${non_domains[@]}" | head -n 5 | tr '\n' ' ')"
local tmp_new_imported_total
# Get the new number of domains in destination file
tmp_new_imported_total="$(grep -c "^" "${target}")"
# Number of imported lines for this file is the difference between the new total and the old total. (Or, the number of domains we just added.)
num_domains="$(( tmp_new_imported_total-num_total_imported_domains ))"
# Replace the running total with the new total.
num_total_imported_domains="$tmp_new_imported_total"
# Get the number of non_domains (this is the number of entries left after stripping the source of comments/duplicates/false positives/domains)
num_non_domains="${#non_domains[@]}"
# If there are unusable lines, we display some information about them. This is not an error or a major cause for concern.
if [[ "${num_non_domains}" -ne 0 ]]; then
echo " ${INFO} Imported ${num_domains} domains, ignoring ${num_non_domains} non-domain entries"
echo " Sample of non-domain entries:"
for each in "${sample_non_domains[@]}"
do do
echo " - ${each}" echo " - ${each}"
done done
else
echo " ${INFO} Imported ${num_domains} domains"
fi fi
} }
compareLists() { compareLists() {
local adlistID="${1}" target="${2}" local adlistID="${1}" target="${2}"
@ -716,8 +745,8 @@ gravity_DownloadBlocklistFromUrl() {
else else
echo -e " ${CROSS} List download failed: ${COL_LIGHT_RED}no cached list available${COL_NC}" echo -e " ${CROSS} List download failed: ${COL_LIGHT_RED}no cached list available${COL_NC}"
# Manually reset these two numbers because we do not call parseList here # Manually reset these two numbers because we do not call parseList here
num_source_lines=0 num_domains=0
num_invalid=0 num_non_domains=0
database_adlist_number "${adlistID}" database_adlist_number "${adlistID}"
database_adlist_status "${adlistID}" "4" database_adlist_status "${adlistID}" "4"
fi fi