Skip to content

Commit 81920c5

Browse files
committed
Support transient SMART failures
As discussed in [1], some SMART errors are transient and should not be treated as permanent. This commit adds support for a configurable list of ATA SMART attribute IDs for which failures will be treated as transient. Drive health history is still recorded and notifications are sent, but the device itself is not marked as failed. Fixes #374. [1] #374
1 parent 578a262 commit 81920c5

File tree

12 files changed

+152
-46
lines changed

12 files changed

+152
-46
lines changed

collector/pkg/config/interface.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ type Interface interface {
2020
GetInt(key string) int
2121
GetString(key string) string
2222
GetStringSlice(key string) []string
23+
GetIntSlice(key string) []int
2324
UnmarshalKey(key string, rawVal interface{}, decoderOpts ...viper.DecoderConfigOption) error
2425

2526
GetDeviceOverrides() []models.ScanOverride

collector/pkg/config/mock/mock_config.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

example.scrutiny.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ log:
5252
file: '' #absolute or relative paths allowed, eg. web.log
5353
level: INFO
5454

55+
failures:
56+
transient:
57+
ata:
58+
- 195 # Hardware_ECC_Recovered, see https://superuser.com/a/1511916/169872
5559

5660
# Notification "urls" look like the following. For more information about service specific configuration see
5761
# Shoutrrr's documentation: https://containrrr.dev/shoutrrr/services/overview/

webapp/backend/pkg/config/config.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
package config
22

33
import (
4-
"github.com/analogj/go-util/utils"
5-
"github.com/analogj/scrutiny/webapp/backend/pkg/errors"
6-
"github.com/spf13/viper"
74
"log"
85
"os"
96
"strings"
7+
8+
"github.com/analogj/go-util/utils"
9+
"github.com/analogj/scrutiny/webapp/backend/pkg/errors"
10+
"github.com/spf13/viper"
1011
)
1112

1213
const DB_USER_SETTINGS_SUBKEY = "user"
@@ -51,6 +52,8 @@ func (c *configuration) Init() error {
5152
c.SetDefault("web.influxdb.token", "scrutiny-default-admin-token")
5253
c.SetDefault("web.influxdb.retention_policy", true)
5354

55+
c.SetDefault("failures.transient.ata", []int{195})
56+
5457
//c.SetDefault("disks.include", []string{})
5558
//c.SetDefault("disks.exclude", []string{})
5659

webapp/backend/pkg/config/interface.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,6 @@ type Interface interface {
2525
GetInt64(key string) int64
2626
GetString(key string) string
2727
GetStringSlice(key string) []string
28+
GetIntSlice(key string) []int
2829
UnmarshalKey(key string, rawVal interface{}, decoderOpts ...viper.DecoderConfigOption) error
2930
}

webapp/backend/pkg/config/mock/mock_config.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

webapp/backend/pkg/constants.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@ const DeviceProtocolAta = "ATA"
44
const DeviceProtocolScsi = "SCSI"
55
const DeviceProtocolNvme = "NVMe"
66

7-
//go:generate stringer -type=AttributeStatus
87
// AttributeStatus bitwise flag, 1,2,4,8,16,32,etc
8+
//
9+
//go:generate stringer -type=AttributeStatus
910
type AttributeStatus uint8
1011

1112
const (
@@ -23,8 +24,9 @@ func AttributeStatusClear(b, flag AttributeStatus) AttributeStatus { return b &
2324
func AttributeStatusToggle(b, flag AttributeStatus) AttributeStatus { return b ^ flag }
2425
func AttributeStatusHas(b, flag AttributeStatus) bool { return b&flag != 0 }
2526

26-
//go:generate stringer -type=DeviceStatus
2727
// DeviceStatus bitwise flag, 1,2,4,8,16,32,etc
28+
//
29+
//go:generate stringer -type=DeviceStatus
2830
type DeviceStatus uint8
2931

3032
const (

webapp/backend/pkg/database/scrutiny_repository_device_smart_attributes.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,22 @@ package database
33
import (
44
"context"
55
"fmt"
6+
"strings"
7+
"time"
8+
69
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
710
"github.com/analogj/scrutiny/webapp/backend/pkg/models/measurements"
811
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
912
"github.com/influxdata/influxdb-client-go/v2/api"
1013
log "github.com/sirupsen/logrus"
11-
"strings"
12-
"time"
1314
)
1415

15-
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
16+
// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1617
// SMART
17-
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
18+
// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1819
func (sr *scrutinyRepository) SaveSmartAttributes(ctx context.Context, wwn string, collectorSmartData collector.SmartInfo) (measurements.Smart, error) {
1920
deviceSmartData := measurements.Smart{}
20-
err := deviceSmartData.FromCollectorSmartInfo(wwn, collectorSmartData)
21+
err := deviceSmartData.FromCollectorSmartInfo(sr.appConfig, wwn, collectorSmartData)
2122
if err != nil {
2223
sr.logger.Errorln("Could not process SMART metrics", err)
2324
return measurements.Smart{}, err

webapp/backend/pkg/database/scrutiny_repository_migrations.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ import (
44
"context"
55
"errors"
66
"fmt"
7+
"strconv"
8+
"time"
9+
710
"github.com/analogj/scrutiny/webapp/backend/pkg"
811
"github.com/analogj/scrutiny/webapp/backend/pkg/database/migrations/m20201107210306"
912
"github.com/analogj/scrutiny/webapp/backend/pkg/database/migrations/m20220503120000"
@@ -17,8 +20,6 @@ import (
1720
"github.com/influxdata/influxdb-client-go/v2/api/http"
1821
log "github.com/sirupsen/logrus"
1922
"gorm.io/gorm"
20-
"strconv"
21-
"time"
2223
)
2324

2425
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -384,8 +385,8 @@ func (sr *scrutinyRepository) Migrate(ctx context.Context) error {
384385

385386
// helpers
386387

387-
//When adding data to influxdb, an error may be returned if the data point is outside the range of the retention policy.
388-
//This function will ignore retention policy errors, and allow the migration to continue.
388+
// When adding data to influxdb, an error may be returned if the data point is outside the range of the retention policy.
389+
// This function will ignore retention policy errors, and allow the migration to continue.
389390
func ignorePastRetentionPolicyError(err error) error {
390391
var influxDbWriteError *http.Error
391392
if errors.As(err, &influxDbWriteError) {
@@ -468,7 +469,7 @@ func m20201107210306_FromPreInfluxDBSmartResultsCreatePostInfluxDBSmartResults(d
468469
})
469470
}
470471

471-
postDeviceSmartData.ProcessAtaSmartInfo(preAtaSmartAttributesTable)
472+
postDeviceSmartData.ProcessAtaSmartInfo(nil, preAtaSmartAttributesTable)
472473

473474
} else if preDevice.IsNvme() {
474475
//info collector.SmartInfo

webapp/backend/pkg/models/measurements/smart.go

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@ package measurements
22

33
import (
44
"fmt"
5-
"github.com/analogj/scrutiny/webapp/backend/pkg"
6-
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
7-
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
85
"log"
96
"strconv"
107
"strings"
118
"time"
9+
10+
"github.com/analogj/scrutiny/webapp/backend/pkg"
11+
"github.com/analogj/scrutiny/webapp/backend/pkg/config"
12+
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
13+
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
1214
)
1315

1416
type Smart struct {
@@ -100,8 +102,8 @@ func NewSmartFromInfluxDB(attrs map[string]interface{}) (*Smart, error) {
100102
return &sm, nil
101103
}
102104

103-
//Parse Collector SMART data results and create Smart object (and associated SmartAtaAttribute entries)
104-
func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) error {
105+
// Parse Collector SMART data results and create Smart object (and associated SmartAtaAttribute entries)
106+
func (sm *Smart) FromCollectorSmartInfo(cfg config.Interface, wwn string, info collector.SmartInfo) error {
105107
sm.DeviceWWN = wwn
106108
sm.Date = time.Unix(info.LocalTime.TimeT, 0)
107109

@@ -117,7 +119,7 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er
117119
// process ATA/NVME/SCSI protocol data
118120
sm.Attributes = map[string]SmartAttribute{}
119121
if sm.DeviceProtocol == pkg.DeviceProtocolAta {
120-
sm.ProcessAtaSmartInfo(info.AtaSmartAttributes.Table)
122+
sm.ProcessAtaSmartInfo(cfg, info.AtaSmartAttributes.Table)
121123
} else if sm.DeviceProtocol == pkg.DeviceProtocolNvme {
122124
sm.ProcessNvmeSmartInfo(info.NvmeSmartHealthInformationLog)
123125
} else if sm.DeviceProtocol == pkg.DeviceProtocolScsi {
@@ -127,8 +129,8 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er
127129
return nil
128130
}
129131

130-
//generate SmartAtaAttribute entries from Scrutiny Collector Smart data.
131-
func (sm *Smart) ProcessAtaSmartInfo(tableItems []collector.AtaSmartAttributesTableItem) {
132+
// generate SmartAtaAttribute entries from Scrutiny Collector Smart data.
133+
func (sm *Smart) ProcessAtaSmartInfo(cfg config.Interface, tableItems []collector.AtaSmartAttributesTableItem) {
132134
for _, collectorAttr := range tableItems {
133135
attrModel := SmartAtaAttribute{
134136
AttributeId: collectorAttr.ID,
@@ -149,13 +151,27 @@ func (sm *Smart) ProcessAtaSmartInfo(tableItems []collector.AtaSmartAttributesTa
149151
attrModel.PopulateAttributeStatus()
150152
sm.Attributes[strconv.Itoa(collectorAttr.ID)] = &attrModel
151153

154+
var transient bool
155+
156+
if cfg != nil {
157+
transients := cfg.GetIntSlice("failures.transient.ata")
158+
for i := range transients {
159+
if collectorAttr.ID == transients[i] {
160+
transient = true
161+
break
162+
}
163+
}
164+
}
165+
152166
if pkg.AttributeStatusHas(attrModel.Status, pkg.AttributeStatusFailedScrutiny) {
153-
sm.Status = pkg.DeviceStatusSet(sm.Status, pkg.DeviceStatusFailedScrutiny)
167+
if !transient {
168+
sm.Status = pkg.DeviceStatusSet(sm.Status, pkg.DeviceStatusFailedScrutiny)
169+
}
154170
}
155171
}
156172
}
157173

158-
//generate SmartNvmeAttribute entries from Scrutiny Collector Smart data.
174+
// generate SmartNvmeAttribute entries from Scrutiny Collector Smart data.
159175
func (sm *Smart) ProcessNvmeSmartInfo(nvmeSmartHealthInformationLog collector.NvmeSmartHealthInformationLog) {
160176

161177
sm.Attributes = map[string]SmartAttribute{
@@ -185,7 +201,7 @@ func (sm *Smart) ProcessNvmeSmartInfo(nvmeSmartHealthInformationLog collector.Nv
185201
}
186202
}
187203

188-
//generate SmartScsiAttribute entries from Scrutiny Collector Smart data.
204+
// generate SmartScsiAttribute entries from Scrutiny Collector Smart data.
189205
func (sm *Smart) ProcessScsiSmartInfo(defectGrownList int64, scsiErrorCounterLog collector.ScsiErrorCounterLog) {
190206
sm.Attributes = map[string]SmartAttribute{
191207
"scsi_grown_defect_list": (&SmartScsiAttribute{AttributeId: "scsi_grown_defect_list", Value: defectGrownList, Threshold: 0}).PopulateAttributeStatus(),

0 commit comments

Comments
 (0)