From 79f4a7cef094ae7661ddfe2816f84b0585957523 Mon Sep 17 00:00:00 2001 From: Adam Warner Date: Fri, 20 Jan 2023 18:52:14 +0000 Subject: [PATCH 1/2] Maintain a list of valid hostnames (taken from the top of StevenBlack's host file), and discount them when calculating the number of "invalid" domains in a given list Soften the output message when reporting on unsuable domains Signed-off-by: Adam Warner --- gravity.sh | 67 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/gravity.sh b/gravity.sh index a5c944ce..e8dfb34d 100755 --- a/gravity.sh +++ b/gravity.sh @@ -244,7 +244,7 @@ database_adlist_number() { return; fi - output=$( { printf ".timeout 30000\\nUPDATE adlist SET number = %i, invalid_domains = %i WHERE id = %i;\\n" "${num_source_lines}" "${num_invalid}" "${1}" | pihole-FTL sqlite3 "${gravityDBfile}"; } 2>&1 ) + output=$( { printf ".timeout 30000\\nUPDATE adlist SET number = %i, invalid_domains = %i WHERE id = %i;\\n" "${num_source_lines}" "${num_unusable}" "${1}" | pihole-FTL sqlite3 "${gravityDBfile}"; } 2>&1 ) status="$?" if [[ "${status}" -ne 0 ]]; then @@ -522,9 +522,9 @@ gravity_DownloadBlocklists() { # num_target_lines does increase for every correctly added domain in pareseList() num_target_lines=0 num_source_lines=0 -num_invalid=0 +num_unusable=0 parseList() { - local adlistID="${1}" src="${2}" target="${3}" incorrect_lines sample_incorrect_lines + local adlistID="${1}" src="${2}" target="${3}" unusable_lines sample_unusable_lines tmp_unusuable_lines_str false_positive # This sed does the following things: # 1. Remove all lines containing no domains # 2. Remove all domains containing invalid characters. Valid are: a-z, A-Z, 0-9, dot (.), minus (-), underscore (_) @@ -534,10 +534,41 @@ parseList() { sed -r "/([^\.]+\.)+[^\.]{2,}/!d;/[^a-zA-Z0-9.\_-]/d;s/\.$//;s/$/,${adlistID}/;/.$/a\\" "${src}" >> "${target}" # Find lines containing no domains or with invalid characters (see above) - # Remove duplicates and limit to 5 domains - mapfile -t incorrect_lines <<< "$(sed -r "/([^\.]+\.)+[^\.]{2,}/d" < "${src}")" - mapfile -t -O "${#incorrect_lines[@]}" incorrect_lines <<< "$(sed -r "/[^a-zA-Z0-9.\_-]/!d" < "${src}")" - IFS=" " read -r -a sample_incorrect_lines <<< "$(tr ' ' '\n' <<< "${incorrect_lines[@]}" | sort -u | head -n 5| tr '\n' ' ')" + # Remove duplicates from the list + mapfile -t unusable_lines <<< "$(sed -r "/([^\.]+\.)+[^\.]{2,}/d" < "${src}")" + mapfile -t -O "${#unusable_lines[@]}" unusable_lines <<< "$(sed -r "/[^a-zA-Z0-9.\_-]/!d" < "${src}")" + IFS=" " read -r -a unusable_lines <<< "$(tr ' ' '\n' <<< "${unusable_lines[@]}" | sort -u | tr '\n' ' ')" + + # A list of items of common local hostnames not to report as unusable + # Some lists (i.e StevenBlack's) contain these as they are supposed to be used as HOST files + # but flagging them as unusable causes more confusion than it's worth - so we suppress them from the output + false_positives=( + "localhost" + "localhost.localdomain" + "local" + "broadcasthost" + "localhost" + "ip6-localhost" + "ip6-loopback" + "lo0 localhost" + "ip6-localnet" + "ip6-mcastprefix" + "ip6-allnodes" + "ip6-allrouters" + "ip6-allhosts" + ) + + # Read the unusable lines into a string + tmp_unusuable_lines_str=" ${unusable_lines[*]} " + for false_positive in "${false_positives[@]}"; do + # Remove false positives from tmp_unusuable_lines_str + tmp_unusuable_lines_str="${tmp_unusuable_lines_str/ ${false_positive} / }" + done + # Read the string back into an array + IFS=" " read -r -a unusable_lines <<< "${tmp_unusuable_lines_str}" + + # Get a sample of the incorrect lines, limited to 5 (the list should already have been de-duplicated) + IFS=" " read -r -a sample_unusable_lines <<< "$(tr ' ' '\n' <<< "${unusable_lines[@]}" | head -n 5 | tr '\n' ' ')" local num_target_lines_new num_correct_lines # Get number of lines in source file @@ -548,22 +579,20 @@ parseList() { num_correct_lines="$(( num_target_lines_new-num_target_lines ))" # Update number of lines in target file num_target_lines="$num_target_lines_new" - num_invalid="$(( num_source_lines-num_correct_lines ))" - if [[ "${num_invalid}" -eq 0 ]]; then - echo " ${INFO} Analyzed ${num_source_lines} domains" - else - echo " ${INFO} Analyzed ${num_source_lines} domains, ${num_invalid} domains invalid!" - fi + num_unusable="${#unusable_lines[@]}" - # Display sample of invalid lines if we found some - if [ ${#sample_incorrect_lines[@]} -ne 0 ]; then - echo " Sample of invalid domains:" - for each in "${sample_incorrect_lines[@]}" + if [[ "${num_unusable}" -ne 0 ]]; then + echo " ${INFO} Imported ${num_correct_lines} domains, ignoring ${num_unusable} non-domain entries" + echo " Sample of non-domain entries:" + for each in "${sample_unusable_lines[@]}" do - echo " - ${each}" + echo " - ${each}" done + else + echo " ${INFO} Imported ${num_correct_lines} domains" fi } + compareLists() { local adlistID="${1}" target="${2}" @@ -717,7 +746,7 @@ gravity_DownloadBlocklistFromUrl() { echo -e " ${CROSS} List download failed: ${COL_LIGHT_RED}no cached list available${COL_NC}" # Manually reset these two numbers because we do not call parseList here num_source_lines=0 - num_invalid=0 + num_unusable=0 database_adlist_number "${adlistID}" database_adlist_status "${adlistID}" "4" fi From 9939cf1d774ced34b528c37ea6a90137419315b5 Mon Sep 17 00:00:00 2001 From: Adam Warner Date: Sat, 21 Jan 2023 23:47:19 +0000 Subject: [PATCH 2/2] Rename some of the variables to hopefully make the process a little clearer Signed-off-by: Adam Warner --- gravity.sh | 62 +++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/gravity.sh b/gravity.sh index e8dfb34d..c2795442 100755 --- a/gravity.sh +++ b/gravity.sh @@ -244,7 +244,7 @@ database_adlist_number() { return; fi - output=$( { printf ".timeout 30000\\nUPDATE adlist SET number = %i, invalid_domains = %i WHERE id = %i;\\n" "${num_source_lines}" "${num_unusable}" "${1}" | pihole-FTL sqlite3 "${gravityDBfile}"; } 2>&1 ) + output=$( { printf ".timeout 30000\\nUPDATE adlist SET number = %i, invalid_domains = %i WHERE id = %i;\\n" "${num_domains}" "${num_non_domains}" "${1}" | pihole-FTL sqlite3 "${gravityDBfile}"; } 2>&1 ) status="$?" if [[ "${status}" -ne 0 ]]; then @@ -519,12 +519,12 @@ gravity_DownloadBlocklists() { gravity_Blackbody=true } -# num_target_lines does increase for every correctly added domain in pareseList() -num_target_lines=0 -num_source_lines=0 -num_unusable=0 +# num_total_imported_domains increases for each list processed +num_total_imported_domains=0 +num_domains=0 +num_non_domains=0 parseList() { - local adlistID="${1}" src="${2}" target="${3}" unusable_lines sample_unusable_lines tmp_unusuable_lines_str false_positive + local adlistID="${1}" src="${2}" target="${3}" non_domains sample_non_domains tmp_non_domains_str false_positive # This sed does the following things: # 1. Remove all lines containing no domains # 2. Remove all domains containing invalid characters. Valid are: a-z, A-Z, 0-9, dot (.), minus (-), underscore (_) @@ -535,9 +535,9 @@ parseList() { # Find lines containing no domains or with invalid characters (see above) # Remove duplicates from the list - mapfile -t unusable_lines <<< "$(sed -r "/([^\.]+\.)+[^\.]{2,}/d" < "${src}")" - mapfile -t -O "${#unusable_lines[@]}" unusable_lines <<< "$(sed -r "/[^a-zA-Z0-9.\_-]/!d" < "${src}")" - IFS=" " read -r -a unusable_lines <<< "$(tr ' ' '\n' <<< "${unusable_lines[@]}" | sort -u | tr '\n' ' ')" + mapfile -t non_domains <<< "$(sed -r "/([^\.]+\.)+[^\.]{2,}/d" < "${src}")" + mapfile -t -O "${#non_domains[@]}" non_domains <<< "$(sed -r "/[^a-zA-Z0-9.\_-]/!d" < "${src}")" + IFS=" " read -r -a non_domains <<< "$(tr ' ' '\n' <<< "${non_domains[@]}" | sort -u | tr '\n' ' ')" # A list of items of common local hostnames not to report as unusable # Some lists (i.e StevenBlack's) contain these as they are supposed to be used as HOST files @@ -559,37 +559,37 @@ parseList() { ) # Read the unusable lines into a string - tmp_unusuable_lines_str=" ${unusable_lines[*]} " + tmp_non_domains_str=" ${non_domains[*]} " for false_positive in "${false_positives[@]}"; do - # Remove false positives from tmp_unusuable_lines_str - tmp_unusuable_lines_str="${tmp_unusuable_lines_str/ ${false_positive} / }" + # Remove false positives from tmp_non_domains_str + tmp_non_domains_str="${tmp_non_domains_str/ ${false_positive} / }" done # Read the string back into an array - IFS=" " read -r -a unusable_lines <<< "${tmp_unusuable_lines_str}" + IFS=" " read -r -a non_domains <<< "${tmp_non_domains_str}" - # Get a sample of the incorrect lines, limited to 5 (the list should already have been de-duplicated) - IFS=" " read -r -a sample_unusable_lines <<< "$(tr ' ' '\n' <<< "${unusable_lines[@]}" | head -n 5 | tr '\n' ' ')" + # Get a sample of non-domain entries, limited to 5 (the list should already have been de-duplicated) + IFS=" " read -r -a sample_non_domains <<< "$(tr ' ' '\n' <<< "${non_domains[@]}" | head -n 5 | tr '\n' ' ')" - local num_target_lines_new num_correct_lines - # Get number of lines in source file - num_source_lines="$(grep -c "^" "${src}")" - # Get the new number of lines in destination file - num_target_lines_new="$(grep -c "^" "${target}")" - # Number of new correctly added lines - num_correct_lines="$(( num_target_lines_new-num_target_lines ))" - # Update number of lines in target file - num_target_lines="$num_target_lines_new" - num_unusable="${#unusable_lines[@]}" + local tmp_new_imported_total + # Get the new number of domains in destination file + tmp_new_imported_total="$(grep -c "^" "${target}")" + # Number of imported lines for this file is the difference between the new total and the old total. (Or, the number of domains we just added.) + num_domains="$(( tmp_new_imported_total-num_total_imported_domains ))" + # Replace the running total with the new total. + num_total_imported_domains="$tmp_new_imported_total" + # Get the number of non_domains (this is the number of entries left after stripping the source of comments/duplicates/false positives/domains) + num_non_domains="${#non_domains[@]}" - if [[ "${num_unusable}" -ne 0 ]]; then - echo " ${INFO} Imported ${num_correct_lines} domains, ignoring ${num_unusable} non-domain entries" + # If there are unusable lines, we display some information about them. This is not error or major cause for concern. + if [[ "${num_non_domains}" -ne 0 ]]; then + echo " ${INFO} Imported ${num_domains} domains, ignoring ${num_non_domains} non-domain entries" echo " Sample of non-domain entries:" - for each in "${sample_unusable_lines[@]}" + for each in "${sample_non_domains[@]}" do echo " - ${each}" done else - echo " ${INFO} Imported ${num_correct_lines} domains" + echo " ${INFO} Imported ${num_domains} domains" fi } @@ -745,8 +745,8 @@ gravity_DownloadBlocklistFromUrl() { else echo -e " ${CROSS} List download failed: ${COL_LIGHT_RED}no cached list available${COL_NC}" # Manually reset these two numbers because we do not call parseList here - num_source_lines=0 - num_unusable=0 + num_domains=0 + num_non_domains=0 database_adlist_number "${adlistID}" database_adlist_status "${adlistID}" "4" fi