7f1cad6faf
We have an alert on `repair_too_many_nodes_failed` which fires too frequently. Every time so far, it has been because of a network blip of some nature on the satellite side. Satellite operators are expected to have other means in place for alerting on network problems and fixing them, so it's not necessary for the repair framework to act in that way. Instead, in this change, we change the way that `repair_too_many_nodes_failed` works. When a repair fails, we collect piece fetch errors by type and determine from them whether it looks like we are having network problems (most errors are connection failures, possibly also some successful connections which subsequently time out) or whether something else has happened. We will now only emit `repair_too_many_nodes_failed` when the outcome does not look like a network failure. In the network failure case, we will instead emit `repair_suspected_network_problem`. Refs: https://github.com/storj/storj/issues/4669 Change-Id: I49df98da5df9c606b95ad08a2bdfec8092fba926
151 lines
10 KiB
Plaintext
151 lines
10 KiB
Plaintext
storj.io/storj/private/lifecycle."slow_shutdown" Event
|
|
storj.io/storj/private/lifecycle."unexpected_shutdown" Event
|
|
storj.io/storj/satellite/accounting."bucket_bytes" IntVal
|
|
storj.io/storj/satellite/accounting."bucket_objects" IntVal
|
|
storj.io/storj/satellite/accounting."bucket_segments" IntVal
|
|
storj.io/storj/satellite/accounting."total_bytes" IntVal
|
|
storj.io/storj/satellite/accounting."total_objects" IntVal
|
|
storj.io/storj/satellite/accounting."total_segments" IntVal
|
|
storj.io/storj/satellite/accounting/tally."nodetallies.totalsum" IntVal
|
|
storj.io/storj/satellite/audit."audit_contained_nodes" IntVal
|
|
storj.io/storj/satellite/audit."audit_contained_nodes_global" Meter
|
|
storj.io/storj/satellite/audit."audit_contained_percentage" FloatVal
|
|
storj.io/storj/satellite/audit."audit_fail_nodes" IntVal
|
|
storj.io/storj/satellite/audit."audit_fail_nodes_global" Meter
|
|
storj.io/storj/satellite/audit."audit_failed_percentage" FloatVal
|
|
storj.io/storj/satellite/audit."audit_offline_nodes" IntVal
|
|
storj.io/storj/satellite/audit."audit_offline_nodes_global" Meter
|
|
storj.io/storj/satellite/audit."audit_offline_percentage" FloatVal
|
|
storj.io/storj/satellite/audit."audit_success_nodes" IntVal
|
|
storj.io/storj/satellite/audit."audit_success_nodes_global" Meter
|
|
storj.io/storj/satellite/audit."audit_successful_percentage" FloatVal
|
|
storj.io/storj/satellite/audit."audit_total_nodes" IntVal
|
|
storj.io/storj/satellite/audit."audit_total_nodes_global" Meter
|
|
storj.io/storj/satellite/audit."audit_total_pointer_nodes" IntVal
|
|
storj.io/storj/satellite/audit."audit_total_pointer_nodes_global" Meter
|
|
storj.io/storj/satellite/audit."audit_unknown_nodes" IntVal
|
|
storj.io/storj/satellite/audit."audit_unknown_nodes_global" Meter
|
|
storj.io/storj/satellite/audit."audit_unknown_percentage" FloatVal
|
|
storj.io/storj/satellite/audit."audited_percentage" FloatVal
|
|
storj.io/storj/satellite/audit."could_not_verify_audit_shares" Counter
|
|
storj.io/storj/satellite/audit."reverify_contained" IntVal
|
|
storj.io/storj/satellite/audit."reverify_contained_global" Meter
|
|
storj.io/storj/satellite/audit."reverify_contained_in_segment" IntVal
|
|
storj.io/storj/satellite/audit."reverify_fails" IntVal
|
|
storj.io/storj/satellite/audit."reverify_fails_global" Meter
|
|
storj.io/storj/satellite/audit."reverify_offlines" IntVal
|
|
storj.io/storj/satellite/audit."reverify_offlines_global" Meter
|
|
storj.io/storj/satellite/audit."reverify_successes" IntVal
|
|
storj.io/storj/satellite/audit."reverify_successes_global" Meter
|
|
storj.io/storj/satellite/audit."reverify_total_in_segment" IntVal
|
|
storj.io/storj/satellite/audit."reverify_unknown" IntVal
|
|
storj.io/storj/satellite/audit."reverify_unknown_global" Meter
|
|
storj.io/storj/satellite/audit."verify_shares_downloaded_successfully" IntVal
|
|
storj.io/storj/satellite/console."create_user_attempt" Counter
|
|
storj.io/storj/satellite/console."create_user_captcha_error" Counter
|
|
storj.io/storj/satellite/console."create_user_captcha_unsuccessful" Counter
|
|
storj.io/storj/satellite/console."create_user_duplicate_unverified" Counter
|
|
storj.io/storj/satellite/console."create_user_duplicate_verified" Counter
|
|
storj.io/storj/satellite/console."create_user_success" Counter
|
|
storj.io/storj/satellite/console."login_attempt" Counter
|
|
storj.io/storj/satellite/console."login_email_invalid" Counter
|
|
storj.io/storj/satellite/console."login_email_unverified" Counter
|
|
storj.io/storj/satellite/console."login_failed" Counter
|
|
storj.io/storj/satellite/console."login_invalid_password" Counter
|
|
storj.io/storj/satellite/console."login_locked_out" Counter
|
|
storj.io/storj/satellite/console."login_lockout_initiated" Counter
|
|
storj.io/storj/satellite/console."login_lockout_reinitiated" Counter
|
|
storj.io/storj/satellite/console."login_mfa_conflict" Counter
|
|
storj.io/storj/satellite/console."login_mfa_missing" Counter
|
|
storj.io/storj/satellite/console."login_mfa_passcode_failure" Counter
|
|
storj.io/storj/satellite/console."login_mfa_passcode_success" Counter
|
|
storj.io/storj/satellite/console."login_mfa_recovery_failure" Counter
|
|
storj.io/storj/satellite/console."login_mfa_recovery_success" Counter
|
|
storj.io/storj/satellite/console."login_success" Counter
|
|
storj.io/storj/satellite/console."login_user_captcha_error" Counter
|
|
storj.io/storj/satellite/console."login_user_captcha_unsuccessful" Counter
|
|
storj.io/storj/satellite/console."login_user_failed_count" IntVal
|
|
storj.io/storj/satellite/contact."failed_dial" Event
|
|
storj.io/storj/satellite/contact."failed_ping_node" Event
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_fail_max_failures_percentage" Meter
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_fail_validation" Meter
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_final_bytes_transferred" IntVal
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_final_pieces_failed" IntVal
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_final_pieces_succeess" IntVal
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_init_node_age_seconds" FloatVal
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_init_node_audit_success_count" IntVal
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_init_node_audit_total_count" IntVal
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_init_node_piece_count" IntVal
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_success" Meter
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_successful_pieces_transfer_ratio" IntVal
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_transfer_piece_fail" Meter
|
|
storj.io/storj/satellite/gracefulexit."graceful_exit_transfer_piece_success" Meter
|
|
storj.io/storj/satellite/metabase/segmentloop."segmentloop_error" Event
|
|
storj.io/storj/satellite/metabase/segmentloop."segmentsProcessed" IntVal
|
|
storj.io/storj/satellite/metabase/segmentloop.*Service.RunOnce Task
|
|
storj.io/storj/satellite/metainfo."metainfo_rate_limit_exceeded" Event
|
|
storj.io/storj/satellite/metainfo/piecedeletion."delete_batch_size" IntVal
|
|
storj.io/storj/satellite/metainfo/piecedeletion."deletion_pieces_unhandled_count" IntVal
|
|
storj.io/storj/satellite/metrics."total_inline_bytes" IntVal
|
|
storj.io/storj/satellite/metrics."total_inline_segments" IntVal
|
|
storj.io/storj/satellite/metrics."total_remote_bytes" IntVal
|
|
storj.io/storj/satellite/metrics."total_remote_segments" IntVal
|
|
storj.io/storj/satellite/orders."download_failed_not_enough_pieces_uplink" Meter
|
|
storj.io/storj/satellite/repair/checker."checker_injured_segment_health" FloatVal
|
|
storj.io/storj/satellite/repair/checker."checker_segment_age" IntVal
|
|
storj.io/storj/satellite/repair/checker."checker_segment_health" FloatVal
|
|
storj.io/storj/satellite/repair/checker."checker_segment_healthy_count" IntVal
|
|
storj.io/storj/satellite/repair/checker."checker_segment_time_until_irreparable" IntVal
|
|
storj.io/storj/satellite/repair/checker."checker_segment_total_count" IntVal
|
|
storj.io/storj/satellite/repair/checker."checker_segments_below_min_req" Counter
|
|
storj.io/storj/satellite/repair/checker."healthy_segments_removed_from_queue" IntVal
|
|
storj.io/storj/satellite/repair/checker."new_remote_segments_needing_repair" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_files_checked" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_files_lost" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_segments_checked" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_segments_failed_to_check" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_segments_healthy_percentage" FloatVal
|
|
storj.io/storj/satellite/repair/checker."remote_segments_lost" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_segments_needing_repair" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_1" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_2" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_3" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_4" IntVal
|
|
storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_5" IntVal
|
|
storj.io/storj/satellite/repair/repairer."download_failed_not_enough_pieces_repair" Meter
|
|
storj.io/storj/satellite/repair/repairer."healthy_ratio_after_repair" FloatVal
|
|
storj.io/storj/satellite/repair/repairer."healthy_ratio_before_repair" FloatVal
|
|
storj.io/storj/satellite/repair/repairer."repair_attempts" Meter
|
|
storj.io/storj/satellite/repair/repairer."repair_bytes_downloaded" Meter
|
|
storj.io/storj/satellite/repair/repairer."repair_bytes_uploaded" Meter
|
|
storj.io/storj/satellite/repair/repairer."repair_failed" Meter
|
|
storj.io/storj/satellite/repair/repairer."repair_nodes_unavailable" Meter
|
|
storj.io/storj/satellite/repair/repairer."repair_partial" Meter
|
|
storj.io/storj/satellite/repair/repairer."repair_segment_pieces_canceled" IntVal
|
|
storj.io/storj/satellite/repair/repairer."repair_segment_pieces_failed" IntVal
|
|
storj.io/storj/satellite/repair/repairer."repair_segment_pieces_successful" IntVal
|
|
storj.io/storj/satellite/repair/repairer."repair_segment_pieces_total" IntVal
|
|
storj.io/storj/satellite/repair/repairer."repair_segment_size" IntVal
|
|
storj.io/storj/satellite/repair/repairer."repair_success" Meter
|
|
storj.io/storj/satellite/repair/repairer."repair_suspected_network_problem" Meter
|
|
storj.io/storj/satellite/repair/repairer."repair_too_many_nodes_failed" Meter
|
|
storj.io/storj/satellite/repair/repairer."repair_unnecessary" Meter
|
|
storj.io/storj/satellite/repair/repairer."repairer_segments_below_min_req" Counter
|
|
storj.io/storj/satellite/repair/repairer."segment_deleted_before_repair" Meter
|
|
storj.io/storj/satellite/repair/repairer."segment_repair_count" IntVal
|
|
storj.io/storj/satellite/repair/repairer."segment_time_until_repair" IntVal
|
|
storj.io/storj/satellite/repair/repairer."time_for_repair" FloatVal
|
|
storj.io/storj/satellite/repair/repairer."time_since_checker_queue" FloatVal
|
|
storj.io/storj/satellite/satellitedb."audit_online_score" FloatVal
|
|
storj.io/storj/satellite/satellitedb."audit_reputation_alpha" FloatVal
|
|
storj.io/storj/satellite/satellitedb."audit_reputation_beta" FloatVal
|
|
storj.io/storj/satellite/satellitedb."bad_audit_dqs" Meter
|
|
storj.io/storj/satellite/satellitedb."offline_dqs" Meter
|
|
storj.io/storj/satellite/satellitedb."unknown_audit_reputation_alpha" FloatVal
|
|
storj.io/storj/satellite/satellitedb."unknown_audit_reputation_beta" FloatVal
|
|
storj.io/storj/satellite/satellitedb."unknown_suspension_dqs" Meter
|
|
storj.io/storj/storage/filestore."open_file_in_trash" Meter
|
|
storj.io/storj/storagenode/contact."satellite_contact_request" Meter
|
|
storj.io/storj/storagenode/gracefulexit."satellite_gracefulexit_request" Meter
|
|
storj.io/storj/storagenode/piecestore/usedserials."delete_random_serial" Meter
|