From c4f31f45de962966a9444a46f23693fae0a3baaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Sachse?= <joerg.sachse@slub-dresden.de> Date: Thu, 15 Sep 2022 14:49:27 +0200 Subject: [PATCH] feat: add more sources for information on corrupt DB blocks --- .../43200/check_oracle_db_table_integrity.sh | 125 +++++++++++++++--- 1 file changed, 104 insertions(+), 21 deletions(-) diff --git a/files/usr/lib/check_mk_agent/local/43200/check_oracle_db_table_integrity.sh b/files/usr/lib/check_mk_agent/local/43200/check_oracle_db_table_integrity.sh index 466f2ff..2f2aba0 100755 --- a/files/usr/lib/check_mk_agent/local/43200/check_oracle_db_table_integrity.sh +++ b/files/usr/lib/check_mk_agent/local/43200/check_oracle_db_table_integrity.sh @@ -7,7 +7,7 @@ # REQUIREMENTS: # - Bash 4 or higher -# - Oracle 11 DB or higher with sqlplus installed +# - Oracle 11 DB or higher with sqlplus & rman installed scriptname=$( basename "${0}" ".sh" ) LOCKFILE="/var/lock/${scriptname}.lock" @@ -16,12 +16,90 @@ status=3 itemname='Oracle corrupt table blocks' perf_values="-" -# check if sqlplus is installed -BINARY="$(su oracle -c 'command -v sqlplus')" -[[ ${BINARY} ]] || exit 1 +# check if sqlplus & rman are installed +SQLP_BINARY="$(su oracle -c 'command -v sqlplus')" +[[ ${SQLP_BINARY} ]] || exit 1 +RMAN_BINARY="$(su oracle -c 'command -v rman')" +[[ ${RMAN_BINARY} ]] || exit 1 -create_commandlist(){ - /bin/su - oracle -c"${BINARY} -S / as sysdba <<-\"EOF\" +DATABASE_SID=$( sudo /bin/su - oracle -c'echo $ORACLE_SID' ) + +grep_for_errors() { + if [[ -n "${1}" ]]; then + local FILE="${1}" + else + echo "ERROR: You need to pass a logfile to grep_for_errors()."
&& exit 1 + fi + + # THE FOLLOWING ERRORS ARE DETECTED: + # - ORA-01210: data file header is media corrupt + # (https://docs.oracle.com/database/121/ERRMG/ORA-00910.htm#ERRMG-GUID-D9EBDFFA-88C6-4185-BD2C-E1B959A97274) + # - ORA-01578: ORACLE data block corrupted (file # string, block # string) + # (https://docs.oracle.com/database/121/ERRMG/ORA-01500.htm#ERRMG-GUID-65B2B9E5-7075-4D53-91B8-FCAECA0AEE0E) + # FEEL FREE TO ADD MORE ERRORS AS NECESSARY. + if grep -q "ORA-01210" "${FILE}" || \ + grep -q "ORA-01578" "${FILE}"; then + ORACLE_OUTPUT=3 + fi +} + +# Runs 35 sec on Dev system as of 2022-09-15. That should be plenty fast. +# https://db.geeksinsight.com/2012/11/15/basics-corruptions-series-3-how-to-find-physical-corruptions-and-limitations-with-approach/ +# All the limitations that apply to DBV are applicable to RMAN VALIDATE also: +# - DBV cannot understand any table / index rowcount mismatch. +# - DBV cannot understand any complex corruptions, especially corruptions +# below cache layer of a datablock. +# - Some blocks may no longer be part of Oracle because they were dropped. But +# DBV will still report that block as corrupted. When you check with the +# query against dba_extents (given below) there won't be any rows returned. +# And this corrupt block will not affect normal database operations as +# Oracle is not going to read this block. But while performing RMAN backups +# we still report this block as corrupted. +# - Some types of corruptions cannot be explored while we do an OS level +# read. +# So, this function is indeed fast, but at the cost of not finding as many +# possible corruptions.
+# https://docs.oracle.com/en/database/oracle/oracle-database/19/rcmrf/BACKUP.html#GUID-73642FF2-43C5-48B2-9969-99001C52EB50 +check_rman() { + /bin/su - oracle -c"echo 'BACKUP VALIDATE CHECK LOGICAL DATABASE;' | rman target / > /tmp/${scriptname}_rman.log" +} + +create_sqlp_commandlist_view() { + /bin/su - oracle -c"cat <<-'EOF' + SET TERMOUT OFF + SET PAGESIZE 0 + /* SET FEEDBACK OFF makes sure that we only get output if there's actual data. */ + SET FEEDBACK OFF + SET VERIFY OFF + SET TRIMSPOOL ON + + /* write temporary file with all the instructions */ + SPOOL /tmp/${scriptname}.sqlview + /* Show the complete table contents. */ + select * from v\$database_block_corruption; + + /* + For each row, find the segment impacted. + This can only be used if you have the absolute file number and block_id that + you get in alert log or by other means. We can't just call it and get a + result willy nilly without extraction identifiers from alert logs first. + We'll comment that out and just leave it in for documentation. + */ + /* + SELECT TABLESPACE_NAME, SEGMENT_NAME, PARTITION_NAME, SEGMENT_TYPE + FROM DBA_EXTENTS + WHERE FILE_ID=&FILE_ID AND + &BLOCK_ID BETWEEN BLOCK_ID AND BLOCK_ID + BLOCKS - 1; + */ + SPOOL OFF + exit; + 'EOF'" > "/tmp/${scriptname}.sqlview" +} + +# For large Databases, this function can take several hours. We'll leave it in +# for now because it's quite thorough, but it's not nice. +create_sqlp_commandlist_validate(){ + /bin/su - oracle -c"${SQLP_BINARY} -S / as sysdba <<-\"EOF\" /* shamelessly stolen (and adapted) from: https://oracle-base.com/dba/script?category=miscellaneous&file=analyze_all.sql @@ -55,26 +133,28 @@ create_commandlist(){ echo "exit;" >> "/tmp/${scriptname}.tempsql" } -# IMPORTANT: Set lock using "flock", NOT "touch"!!! It's atomic and doesn't have to be cleared after the script ran. +# IMPORTANT: Set lock using "flock", NOT "touch"!!! It's atomic and doesn't +# have to be cleared after the script ran.
( flock -n 9 || exit 1 + check_rman + + create_sqlp_commandlist_view + /bin/su - oracle -c"${SQLP_BINARY} -S / as sysdba @/tmp/${scriptname}.sqlview > /tmp/${scriptname}_view.log" + if [[ ( ! -e "/tmp/${scriptname}.tempsql" ) || \ ( $(date -r "/tmp/${scriptname}.tempsql" +%s) -lt $(date -d 'now - 14 days' +%s) ) ]]; then - create_commandlist + create_sqlp_commandlist_validate fi + /bin/su - oracle -c"${SQLP_BINARY} -S / as sysdba @/tmp/${scriptname}.tempsql > /tmp/${scriptname}_validate.log" - /bin/su - oracle -c"${BINARY} -S / as sysdba @/tmp/${scriptname}.tempsql > /tmp/${scriptname}.log" - - # THE FOLLOWING ERRORS ARE DETECTED: - # - ORA-01210: data file header is media corrupt - # (https://docs.oracle.com/database/121/ERRMG/ORA-00910.htm#ERRMG-GUID-D9EBDFFA-88C6-4185-BD2C-E1B959A97274) - # - ORA-01578: ORACLE data block corrupted (file # string, block # string) - # (https://docs.oracle.com/database/121/ERRMG/ORA-01500.htm#ERRMG-GUID-65B2B9E5-7075-4D53-91B8-FCAECA0AEE0E) - # FEEL FREE TO ADD MORE ERRORS AS NECESSARY. - if grep -q "ORA-01210" "/tmp/${scriptname}.log" || \ - grep -q "ORA-01578" "/tmp/${scriptname}.log"; then - ORACLE_OUTPUT=3 - fi + LOGS="/tmp/${scriptname}_view.log + /tmp/${scriptname}_validate.log + /tmp/${scriptname}_rman.log + /exlibris/app/oracle/diag/rdbms/${DATABASE_SID}/${DATABASE_SID}/trace/alert_${DATABASE_SID}.log" + for LOG in ${LOGS}; do + grep_for_errors "${LOG}" + done if [[ ( ${ORACLE_OUTPUT} -eq 0 ) ]]; then status=0 @@ -90,6 +170,9 @@ flock -n 9 || exit 1 ) 9>"${LOCKFILE}" rm -f "/tmp/${scriptname}.tempsql" -rm -f "/tmp/${scriptname}.log" +rm -f "/tmp/${scriptname}_validate.log" +rm -f "/tmp/${scriptname}.sqlview" +rm -f "/tmp/${scriptname}_view.log" +rm -f "/tmp/${scriptname}_rman.log" # Code checked by shellcheck (https://github.com/koalaman/shellcheck) on 2022-09-09 -- GitLab