snapraid/cmdline/check.c
2020-09-11 13:42:22 +02:00

2074 lines
68 KiB
C

/*
* Copyright (C) 2011 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "portable.h"
#include "support.h"
#include "util.h"
#include "elem.h"
#include "import.h"
#include "search.h"
#include "state.h"
#include "parity.h"
#include "handle.h"
#include "raid/raid.h"
#include "raid/combo.h"
/****************************************************************************/
/* check */
/**
 * A block that failed the hash check, or that was deleted.
 *
 * One entry is filled for each block that may need to take part in the
 * recovering process at a given parity position.
 */
struct failed_struct {
/**
 * Set if we know for sure that the block is garbage or missing
 * and it needs to be recovered and rewritten to the disk.
 *
 * It's always 0 for DELETED blocks, which have no file to write back.
 */
int is_bad;
/**
 * Set if what we have recovered may be not up-to-date data:
 * an old version, or just garbage.
 *
 * Essentially, it means that we are not sure that what we have recovered
 * is really correct. It's just our best guess.
 *
 * These "recovered" blocks are also written to the disk if the block is marked as ::is_bad.
 * But these files are also marked as FILE_IS_DAMAGED, and then renamed to .unrecoverable.
 *
 * Blocks with this flag set are not used to verify a recovering with their hash.
 *
 * Note that repair() sets it for CHG blocks when assuming an updated parity,
 * and for DELETED, CHG and REP blocks when assuming a not updated parity.
 */
int is_outofdate;
unsigned index; /**< Index of the disk of the failed block. It's also the index of its data in the buffer vector. */
struct snapraid_block* block; /**< The failed block. */
struct snapraid_disk* disk; /**< The failed disk. */
struct snapraid_file* file; /**< The failed file. 0 for a DELETED block. */
block_off_t file_pos; /**< Offset inside the file. 0 for a DELETED block. */
struct snapraid_handle* handle; /**< The handle containing the failed block, or 0 for a DELETED block. */
};
/**
 * Verify a block buffer against the hash stored in the block.
 *
 * Only the first ::pos_size bytes of ::buffer are hashed. If ::rehash is set
 * the previous hash kind and seed of the state are used, otherwise the current ones.
 * When ::pos_size is shorter than the block size, the remaining bytes of ::buffer
 * must also be identical to the same range of ::buffer_zero.
 *
 * Return 0 if everything matches, -1 otherwise.
 */
static int blockcmp(struct snapraid_state* state, int rehash, struct snapraid_block* block, unsigned pos_size, unsigned char* buffer, unsigned char* buffer_zero)
{
	unsigned char computed[HASH_MAX];

	/* hash the valid part of the block with the proper hash kind */
	if (rehash)
		memhash(state->prevhash, state->prevhashseed, computed, buffer, pos_size);
	else
		memhash(state->hash, state->hashseed, computed, buffer, pos_size);

	/* the computed hash must be the stored one */
	if (memcmp(computed, block->hash, BLOCK_HASH_SIZE) != 0)
		return -1;

	/* the tail up to the end of the block must match the reference buffer */
	if (pos_size < state->block_size
		&& memcmp(buffer + pos_size, buffer_zero + pos_size, state->block_size - pos_size) != 0
	)
		return -1;

	return 0;
}
/**
 * Check if the hashes of all the failed blocks we are expecting to recover are now matching.
 *
 * Only the entries listed in ::failed_map, not marked as ::is_outofdate,
 * and having an updated hash are verified.
 *
 * Return 1 if at least one block was verified and none mismatched. In such case
 * all the parities are recomputed in ::buffer. Return 0 otherwise.
 */
static int is_hash_matching(struct snapraid_state* state, int rehash, unsigned diskmax, struct failed_struct* failed, unsigned* failed_map, unsigned failed_count, void** buffer, void* buffer_zero)
{
	unsigned k;
	int any_checked = 0; /* set when at least one block is verified */

	for (k = 0; k < failed_count; ++k) {
		unsigned entry = failed_map[k];
		struct failed_struct* item = &failed[entry];
		unsigned pos_size;

		/* skip the blocks we are not expected to recover */
		if (item->is_outofdate)
			continue;

		/* skip the blocks without a hash to check */
		if (!block_has_updated_hash(item->block))
			continue;

		/* a single mismatch is enough to reject the recovering */
		pos_size = file_block_size(item->file, item->file_pos, state->block_size);
		if (blockcmp(state, rehash, item->block, pos_size, buffer[item->index], buffer_zero) != 0) {
			log_tag("hash_error: Hash mismatch on entry %u\n", entry);
			return 0;
		}

		any_checked = 1;
	}

	/* without any verified block we cannot trust the result */
	/* the caller already excludes this case, but checking again doesn't hurt */
	if (!any_checked) {
		/* LCOV_EXCL_START */
		return 0;
		/* LCOV_EXCL_STOP */
	}

	/* everything verified is correct, recompute all the redundancy information */
	raid_gen(diskmax, state->level, state->block_size, buffer);

	return 1;
}
/**
 * Check if the specified parity is now matching with a recomputed one.
 *
 * The parities up to the level ::i are recomputed from the data in ::buffer,
 * and the parity ::i is compared with the one stored in ::buffer_recov.
 *
 * Return 1 if they match. In such case all the parities are recomputed
 * in ::buffer. Return 0 otherwise.
 */
static int is_parity_matching(struct snapraid_state* state, unsigned diskmax, unsigned i, void** buffer, void** buffer_recov)
{
	/* recompute the parities up to i, the ones over i are not needed */
	raid_gen(diskmax, i + 1, state->block_size, buffer);

	/* a different parity block means a wrong recovering */
	if (memcmp(buffer[diskmax + i], buffer_recov[i], state->block_size) != 0)
		return 0;

	/* the recovering is confirmed, recompute all the redundancy information */
	raid_gen(diskmax, state->level, state->block_size, buffer);

	return 1;
}
/**
 * Repair errors.
 *
 * Tries to rebuild the ::failed_count blocks listed in ::failed_map using
 * all the possible combinations of the parities available in ::buffer_recov.
 * A recovering is accepted only if it can be verified, either with the hash of
 * at least one recovered block, or with an extra parity not used for recovering.
 *
 * Return <0 if failure for missing strategy, >0 if data is wrong and we cannot rebuild correctly
 * (the value is the number of failed attempts), 0 on success.
 * If success, the parity are computed in the buffer variable.
 */
static int repair_step(struct snapraid_state* state, int rehash, unsigned pos, unsigned diskmax, struct failed_struct* failed, unsigned* failed_map, unsigned failed_count, void** buffer, void** buffer_recov, void* buffer_zero)
{
	unsigned i, n;
	int error;
	int has_hash;
	int id[LEV_MAX];
	int ip[LEV_MAX];

	/* no fix required, already checked at higher level, but just to be sure */
	if (failed_count == 0) {
		/* LCOV_EXCL_START */
		/* recompute only the parity */
		raid_gen(diskmax, state->level, state->block_size, buffer);
		return 0;
		/* LCOV_EXCL_STOP */
	}

	n = state->level;
	error = 0;

	/* setup vector of failed disk indexes */
	/* note that failed_count is limited only by the number of disks, */
	/* while id[] can contain only LEV_MAX entries. */
	/* We fill it only if there are not more failures than parities, */
	/* that is the only case where it's used for recovering. */
	/* This assumes that the number of parities never exceeds LEV_MAX, */
	/* like already required by ip[]. */
	if (failed_count <= n) {
		for (i = 0; i < failed_count; ++i)
			id[i] = failed[failed_map[i]].index;
	}

	/* check if there is at least a failed block that can be checked for correctness using the hash */
	/* if there isn't, we have to sacrifice a parity block to check that the result is correct */
	has_hash = 0;
	for (i = 0; i < failed_count; ++i) {
		/* if we are expected to recover this block */
		if (!failed[failed_map[i]].is_outofdate
			/* if the block has a hash to check */
			&& block_has_updated_hash(failed[failed_map[i]].block)
		)
			has_hash = 1;
	}

	/* if we don't have a hash, but we have an extra parity */
	/* (strictly-less failures than number of parities) */
	if (!has_hash && failed_count < n) {
		/* number of parity to use, one more to check the recovering */
		unsigned r = failed_count + 1;

		/* all combinations (r of n) parities */
		combination_first(r, n, ip);
		do {
			/* if a parity is missing, do nothing */
			for (i = 0; i < r; ++i) {
				if (buffer_recov[ip[i]] == 0)
					break;
			}
			if (i != r)
				continue;

			/* copy the parities to use, one less because the last is used for checking */
			for (i = 0; i < r - 1; ++i)
				memcpy(buffer[diskmax + ip[i]], buffer_recov[ip[i]], state->block_size);

			/* recover using one less parity, the ip[r-1] one */
			raid_data(r - 1, id, ip, diskmax, state->block_size, buffer);

			/* use the remaining ip[r-1] parity to check the result */
			if (is_parity_matching(state, diskmax, ip[r - 1], buffer, buffer_recov))
				return 0;

			/* log */
			log_tag("parity_error:%u:", pos);
			for (i = 0; i < r; ++i) {
				if (i != 0)
					log_tag("/");
				log_tag("%s", lev_config_name(ip[i]));
			}
			log_tag(":parity: Parity mismatch\n");
			++error;
		} while (combination_next(r, n, ip));
	}

	/* if we have a hash, and enough parities */
	/* (less-or-equal failures than number of parities) */
	if (has_hash && failed_count <= n) {
		/* number of parities to use equal at the number of failures */
		unsigned r = failed_count;

		/* all combinations (r of n) parities */
		combination_first(r, n, ip);
		do {
			/* if a parity is missing, do nothing */
			for (i = 0; i < r; ++i) {
				if (buffer_recov[ip[i]] == 0)
					break;
			}
			if (i != r)
				continue;

			/* copy the parities to use */
			for (i = 0; i < r; ++i)
				memcpy(buffer[diskmax + ip[i]], buffer_recov[ip[i]], state->block_size);

			/* recover */
			raid_data(r, id, ip, diskmax, state->block_size, buffer);

			/* use the hash to check the result */
			if (is_hash_matching(state, rehash, diskmax, failed, failed_map, failed_count, buffer, buffer_zero))
				return 0;

			/* log */
			log_tag("parity_error:%u:", pos);
			for (i = 0; i < r; ++i) {
				if (i != 0)
					log_tag("/");
				log_tag("%s", lev_config_name(ip[i]));
			}
			log_tag(":hash: Hash mismatch\n");
			++error;
		} while (combination_next(r, n, ip));
	}

	/* return the number of failed attempts, or -1 if no strategy */
	if (error)
		return error;

	log_tag("strategy_error:%u: No strategy to recover from %u failures with %u parity %s hash\n",
		pos, failed_count, n, has_hash ? "with" : "without");
	return -1;
}
/**
 * Try to repair the failed blocks at the parity position ::pos.
 *
 * Two strategies are tried in sequence. The first one assumes that the parity
 * is updated to the current state, the second one that it still represents
 * the state before the last 'sync'. For each strategy, the entries of ::failed
 * to rebuild are selected into ::failed_map and processed by repair_step().
 *
 * On success, the entries for which the recovered content may be not up-to-date
 * are marked with ::is_outofdate. Note that the second strategy overwrites
 * the data buffers and the ::is_outofdate flags also if it then fails.
 *
 * Return 0 on success, >0 with the total number of failed recovering attempts,
 * or -1 if no attempt was possible.
 */
static int repair(struct snapraid_state* state, int rehash, unsigned pos, unsigned diskmax, struct failed_struct* failed, unsigned* failed_map, unsigned failed_count, void** buffer, void** buffer_recov, void* buffer_zero)
{
int ret;
int error;
unsigned j;
int n;
int something_to_recover;
int something_unsynced;
char esc_buffer[ESC_MAX];
error = 0;
/* if nothing failed, just recompute the parity */
if (failed_count == 0) {
raid_gen(diskmax, state->level, state->block_size, buffer);
return 0;
}
/* logs the status */
/* one "entry" line for each failed block, with its state, hash kind and data condition */
for (j = 0; j < failed_count; ++j) {
const char* desc;
const char* hash;
const char* data;
struct snapraid_block* block = failed[j].block;
unsigned block_state = block_state_get(block);
switch (block_state) {
case BLOCK_STATE_DELETED : desc = "delete"; break;
case BLOCK_STATE_CHG : desc = "change"; break;
case BLOCK_STATE_REP : desc = "replace"; break;
case BLOCK_STATE_BLK : desc = "block"; break;
/* LCOV_EXCL_START */
default : desc = "unknown"; break;
/* LCOV_EXCL_STOP */
}
if (hash_is_invalid(block->hash)) {
hash = "lost";
} else if (hash_is_zero(block->hash)) {
hash = "zero";
} else {
hash = "known";
}
if (failed[j].is_bad)
data = "bad";
else
data = "good";
/* entries without a file (DELETED blocks) are logged without disk and file name */
if (failed[j].file) {
struct snapraid_disk* disk = failed[j].disk;
struct snapraid_file* file = failed[j].file;
block_off_t file_pos = failed[j].file_pos;
log_tag("entry:%u:%s:%s:%s:%s:%s:%u:\n", j, desc, hash, data, disk->name, esc_tag(file->sub, esc_buffer), file_pos);
} else {
log_tag("entry:%u:%s:%s:%s:\n", j, desc, hash, data);
}
}
/* Here we have to try two different strategies to recover, because in case the 'sync' */
/* process is aborted, we don't know if the parity data is really updated just like after 'sync', */
/* or if it still represents the state before the 'sync'. */
/* Note that if the 'sync' ends normally, we don't have any DELETED, REP and CHG blocks */
/* and the two strategies are identical */
/* As first, we assume that the parity IS updated for the current state */
/* and that we are going to recover the state after the last 'sync'. */
/* In this case, parity contains info from BLK, REP and CHG blocks, */
/* but not for DELETED. */
/* We need to put in the recovering process only the bad blocks, because all the */
/* others already contains the correct data read from disk, and the parity is correctly computed for them. */
/* We are interested to recover BLK, REP and CHG blocks if they are marked as bad, */
/* but we are not interested in DELETED ones. */
n = 0;
something_to_recover = 0; /* keep track if there is at least one block to fix */
for (j = 0; j < failed_count; ++j) {
if (failed[j].is_bad) {
unsigned block_state = block_state_get(failed[j].block);
assert(block_state != BLOCK_STATE_DELETED); /* we cannot have bad DELETED blocks */
/* if we have the hash for it */
if ((block_state == BLOCK_STATE_BLK || block_state == BLOCK_STATE_REP)
/* try to fetch the block using the known hash */
&& (state_import_fetch(state, rehash, failed[j].block, buffer[failed[j].index]) == 0
|| state_search_fetch(state, rehash, failed[j].file, failed[j].file_pos, failed[j].block, buffer[failed[j].index]) == 0)
) {
/* we already have corrected it! */
log_tag("hash_import: Fixed entry %u\n", j);
} else {
/* otherwise try to recover it */
failed_map[n] = j;
++n;
/* we have something to try to recover */
something_to_recover = 1;
}
}
}
/* if nothing to fix */
if (!something_to_recover) {
log_tag("recover_sync:%u:%u: Skipped for already recovered\n", pos, n);
/* recompute only the parity */
raid_gen(diskmax, state->level, state->block_size, buffer);
return 0;
}
ret = repair_step(state, rehash, pos, diskmax, failed, failed_map, n, buffer, buffer_recov, buffer_zero);
if (ret == 0) {
/* reprocess the CHG blocks, for which we don't have a hash to check */
/* if they were BAD we have to use some heuristics to ensure that we have recovered */
/* the state after the sync. If unsure, we assume the worst case */
for (j = 0; j < failed_count; ++j) {
/* we take care only of BAD blocks we have to write back */
if (failed[j].is_bad) {
unsigned block_state = block_state_get(failed[j].block);
/* BLK and REP blocks are always OK, because at this point */
/* we have already checked their hash */
if (block_state != BLOCK_STATE_CHG) {
assert(block_state == BLOCK_STATE_BLK || block_state == BLOCK_STATE_REP);
continue;
}
/* for CHG blocks we have to 'guess' if they are correct or not */
/* if the hash is invalid we cannot check the result */
/* this could happen if we have lost this information */
/* after an aborted sync */
if (hash_is_invalid(failed[j].block->hash)) {
/* it may contain garbage */
failed[j].is_outofdate = 1;
log_tag("hash_unknown: Unknown hash on entry %u\n", j);
} else if (hash_is_zero(failed[j].block->hash)) {
/* if the block is not filled with 0, we are sure to have */
/* restored it to the state after the 'sync' */
/* instead, if the block is filled with 0, it could be either that the */
/* block after the sync is really filled by 0, or that */
/* we restored the block before the 'sync'. */
if (memcmp(buffer[failed[j].index], buffer_zero, state->block_size) == 0) {
/* it may contain garbage */
failed[j].is_outofdate = 1;
log_tag("hash_unknown: Maybe old zero on entry %u\n", j);
}
} else {
/* if the hash is different than the previous one, we are sure to have */
/* restored it to the state after the 'sync' */
/* instead, if the hash matches, it could be either that the */
/* block after the sync has this hash, or that */
/* we restored the block before the 'sync'. */
unsigned pos_size = file_block_size(failed[j].file, failed[j].file_pos, state->block_size);
if (blockcmp(state, rehash, failed[j].block, pos_size, buffer[failed[j].index], buffer_zero) == 0) {
/* it may contain garbage */
failed[j].is_outofdate = 1;
log_tag("hash_unknown: Maybe old data on entry %u\n", j);
}
}
}
}
return 0;
}
/* the first strategy failed, accumulate the failed attempts and log the result */
if (ret > 0)
error += ret;
if (ret < 0)
log_tag("recover_sync:%u:%u: Failed with no attempts\n", pos, n);
else
log_tag("recover_sync:%u:%u: Failed with %d attempts\n", pos, n, ret);
/* Now assume that the parity IS NOT updated at the current state, */
/* but still represent the state before the last 'sync' process. */
/* In this case, parity contains info from BLK, REP (old version), CHG (old version) and DELETED blocks, */
/* but not for REP (new version) and CHG (new version). */
/* We are interested to recover BLK ones marked as bad, */
/* but we are not interested to recover CHG (new version) and REP (new version) blocks, */
/* even if marked as bad, because we don't have parity for them and it's just impossible, */
/* and we are not interested to recover DELETED ones. */
n = 0;
something_to_recover = 0; /* keep track if there is at least one block to fix */
something_unsynced = 0; /* keep track if we have some unsynced info to process */
for (j = 0; j < failed_count; ++j) {
unsigned block_state = block_state_get(failed[j].block);
if (block_state == BLOCK_STATE_DELETED
|| block_state == BLOCK_STATE_CHG
|| block_state == BLOCK_STATE_REP
) {
/* If the block is CHG, REP or DELETED, we don't have the original content of block, */
/* and we must try to recover it. */
/* This apply to CHG and REP blocks even if they are not marked bad, */
/* because the parity is computed with old content, and not with the new one. */
/* Note that this recovering is done just to make possible to recover any other BLK one, */
/* we are not really interested in DELETED, CHG (old version) and REP (old version). */
something_unsynced = 1;
if (block_state == BLOCK_STATE_CHG
&& hash_is_zero(failed[j].block->hash)
) {
/* If the block was a ZERO block, restore it to the original 0 as before the 'sync' */
/* We do this to just allow recovering of other BLK ones */
memset(buffer[failed[j].index], 0, state->block_size);
/* note that from now the buffer is definitively lost */
/* we can do this only because it's the last retry of recovering */
/* try to fetch the old block using the old hash for CHG and DELETED blocks */
} else if ((block_state == BLOCK_STATE_CHG || block_state == BLOCK_STATE_DELETED)
&& hash_is_unique(failed[j].block->hash)
&& state_import_fetch(state, rehash, failed[j].block, buffer[failed[j].index]) == 0) {
/* note that from now the buffer is definitively lost */
/* we can do this only because it's the last retry of recovering */
} else {
/* otherwise try to recover it */
failed_map[n] = j;
++n;
/* note that we don't set something_to_recover, because we are */
/* not really interested to recover *only* old blocks. */
}
/* avoid to use the hash of this block to verify the recovering */
/* this applies to REP blocks because we are going to recover the old state */
/* and the REP hash represent the new one */
/* it also applies to CHG and DELETE blocks because we want to have */
/* a successful recovering only if a BLK one is matching */
failed[j].is_outofdate = 1;
} else if (failed[j].is_bad) {
/* If the block is bad we don't know its content, and we try to recover it */
/* At this point, we can have only BLK ones */
assert(block_state == BLOCK_STATE_BLK);
/* we have something we are interested to recover */
something_to_recover = 1;
/* we try to recover it */
failed_map[n] = j;
++n;
}
}
/* if nothing to fix, we just don't try */
/* if nothing unsynced we also don't retry, because it's the same try as before */
if (something_to_recover && something_unsynced) {
ret = repair_step(state, rehash, pos, diskmax, failed, failed_map, n, buffer, buffer_recov, buffer_zero);
if (ret == 0) {
/* reprocess the REP and CHG blocks, for which we have recovered and old state */
/* that we don't want to save into disk */
/* we have already marked them, but we redo it for logging */
for (j = 0; j < failed_count; ++j) {
/* we take care only of BAD blocks we have to write back */
if (failed[j].is_bad) {
unsigned block_state = block_state_get(failed[j].block);
if (block_state == BLOCK_STATE_CHG
|| block_state == BLOCK_STATE_REP
) {
/* mark that we have restored an old state */
/* and we don't want to write it to the disk */
failed[j].is_outofdate = 1;
log_tag("hash_unknown: Surely old data on entry %u\n", j);
}
}
}
return 0;
}
/* the second strategy failed too, accumulate the failed attempts and log the result */
if (ret > 0)
error += ret;
if (ret < 0)
log_tag("recover_unsync:%u:%u: Failed with no attempts\n", pos, n);
else
log_tag("recover_unsync:%u:%u: Failed with %d attempts\n", pos, n, ret);
} else {
log_tag("recover_unsync:%u:%u: Skipped for%s%s\n", pos, n,
!something_to_recover ? " nothing to recover" : "",
!something_unsynced ? " nothing unsynced" : ""
);
}
/* return the number of failed attempts, or -1 if no strategy */
if (error)
return error;
else
return -1;
}
/**
 * Post process all the files at the specified block index ::i.
 * For each file, if we are at the last block, closes it,
 * adjust the timestamp, and print the result.
 *
 * This works with the assumption to always process the whole files to
 * fix. This assumption is not always correct, and in such case we have to
 * skip the whole postprocessing. An example is when fixing only bad blocks.
 *
 * When fixing (::fix != 0), files marked as FILE_IS_DAMAGED are renamed
 * to .unrecoverable, and files marked as FILE_IS_FIXED get their original
 * modification time restored, unless this would result in an inode collision.
 * When only checking, just the final status of the file is printed.
 *
 * The ::handle vector contains ::diskmax entries, one for each disk.
 *
 * Return 0 on success, -1 on error.
 */
static int file_post(struct snapraid_state* state, int fix, unsigned i, struct snapraid_handle* handle, unsigned diskmax)
{
	unsigned j;
	int ret;
	char esc_buffer[ESC_MAX];
	char esc_buffer_alt[ESC_MAX];

	/* if we are processing only bad blocks, we don't have to do any post-processing */
	/* as we don't have any guarantee to process the last block of the fixed files */
	if (state->opt.badonly)
		return 0;

	/* for all the files print the final status, and does the final time fix */
	/* we also ensure to close files after processing the last block */
	for (j = 0; j < diskmax; ++j) {
		struct snapraid_block* block;
		struct snapraid_disk* disk;
		struct snapraid_file* collide_file;
		struct snapraid_file* file;
		block_off_t file_pos;
		char path[PATH_MAX];
		uint64_t inode;

		disk = handle[j].disk;
		if (!disk) {
			/* if no disk, nothing to do */
			continue;
		}

		block = fs_par2block_find(disk, i);
		if (!block_has_file(block)) {
			/* if no file, nothing to do */
			continue;
		}

		file = fs_par2file_get(disk, i, &file_pos);
		pathprint(path, sizeof(path), "%s%s", disk->dir, file->sub);

		/* if it isn't the last block in the file */
		if (!file_block_is_last(file, file_pos)) {
			/* nothing to do */
			continue;
		}

		/* if the file is excluded, we have nothing to adjust as the file is never written */
		if (file_flag_has(file, FILE_IS_EXCLUDED)
			|| (state->opt.syncedonly && file_flag_has(file, FILE_IS_UNSYNCED))) {
			/* nothing to do, but close the file */
			goto close_and_continue;
		}

		/* finish the fix process if it's the last block of the files */
		if (fix) {
			/* mark that we finished with this file */
			/* to identify later any NOT finished ones */
			file_flag_set(file, FILE_IS_FINISHED);

			/* if the file is damaged, meaning that a fix failed */
			if (file_flag_has(file, FILE_IS_DAMAGED)) {
				/* rename it to .unrecoverable */
				char path_to[PATH_MAX];

				pathprint(path_to, sizeof(path_to), "%s%s.unrecoverable", disk->dir, file->sub);

				/* ensure to close the file before renaming */
				if (handle[j].file == file) {
					ret = handle_close(&handle[j]);
					if (ret != 0) {
						/* LCOV_EXCL_START */
						log_tag("error:%u:%s:%s: Close error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
						log_fatal("DANGER! Unexpected close error in a data disk.\n");
						return -1;
						/* LCOV_EXCL_STOP */
					}
				}

				ret = rename(path, path_to);
				if (ret != 0) {
					/* LCOV_EXCL_START */
					log_fatal("Error renaming '%s' to '%s'. %s.\n", path, path_to, strerror(errno));
					log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
					return -1;
					/* LCOV_EXCL_STOP */
				}

				log_tag("status:unrecoverable:%s:%s\n", disk->name, esc_tag(file->sub, esc_buffer));
				msg_info("unrecoverable %s\n", fmt_term(disk, file->sub, esc_buffer));

				/* and do not set the time if damaged */
				goto close_and_continue;
			}

			/* if the file is not fixed, meaning that it is untouched */
			if (!file_flag_has(file, FILE_IS_FIXED)) {
				/* nothing to do, but close the file */
				goto close_and_continue;
			}

			/* if the file is closed or different than the one expected, reopen it */
			/* a different open file could happen when filtering for bad blocks */
			if (handle[j].file != file) {
				/* remember the file we are closing to report it in case of error. */
				/* We cannot read it from the handle after the close, because */
				/* here the handle may have no file at all, and the close */
				/* operation may also reset it (to be confirmed in handle_close()) */
				struct snapraid_file* closing_file = handle[j].file;

				/* close a potential different file */
				ret = handle_close(&handle[j]);
				if (ret != 0) {
					/* LCOV_EXCL_START */
					log_tag("error:%u:%s:%s: Close error. %s\n", i, disk->name, esc_tag(closing_file ? closing_file->sub : file->sub, esc_buffer), strerror(errno));
					log_fatal("DANGER! Unexpected close error in a data disk.\n");
					return -1;
					/* LCOV_EXCL_STOP */
				}

				/* reopen it as readonly, as to set the mtime readonly access it's enough */
				/* we know that the file exists because it has the FILE_IS_FIXED tag */
				ret = handle_open(&handle[j], file, state->file_mode, log_error, 0);
				if (ret != 0) {
					/* LCOV_EXCL_START */
					log_tag("error:%u:%s:%s: Open error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
					log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
					return -1;
					/* LCOV_EXCL_STOP */
				}
			}

			log_tag("status:recovered:%s:%s\n", disk->name, esc_tag(file->sub, esc_buffer));
			msg_info("recovered %s\n", fmt_term(disk, file->sub, esc_buffer));

			inode = handle[j].st.st_ino;

			/* search for the corresponding inode */
			collide_file = tommy_hashdyn_search(&disk->inodeset, file_inode_compare_to_arg, &inode, file_inode_hash(inode));

			/* if the inode is already in the database and it refers at a different file name, */
			/* we can fix the file time ONLY if the time and size allow to differentiate */
			/* between the two files */

			/* for example, suppose we delete a bunch of files with all the same size and time, */
			/* when recreating them the inodes may be reused in a different order, */
			/* and at the next sync some files may have matching inode/size/time even if different name */
			/* not allowing sync to detect that the file is changed and not renamed */
			if (!collide_file /* if not in the database, there is no collision */
				|| strcmp(collide_file->sub, file->sub) == 0 /* if the name is the same, it's the right collision */
				|| collide_file->size != file->size /* if the size is different, the collision is identified */
				|| collide_file->mtime_sec != file->mtime_sec /* if the mtime is different, the collision is identified */
				|| collide_file->mtime_nsec != file->mtime_nsec /* same for mtime_nsec */
			) {
				/* set the original modification time */
				ret = handle_utime(&handle[j]);
				if (ret == -1) {
					/* LCOV_EXCL_START */
					/* mark the file as damaged */
					file_flag_set(file, FILE_IS_DAMAGED);
					log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
					return -1;
					/* LCOV_EXCL_STOP */
				}
			} else {
				log_tag("collision:%s:%s:%s: Not setting modification time to avoid inode collision\n", disk->name, esc_tag(file->sub, esc_buffer), esc_tag(collide_file->sub, esc_buffer_alt));
			}
		} else {
			/* we are not fixing, but only checking */
			/* print just the final status */
			if (file_flag_has(file, FILE_IS_DAMAGED)) {
				if (state->opt.auditonly) {
					log_tag("status:damaged:%s:%s\n", disk->name, esc_tag(file->sub, esc_buffer));
					msg_info("damaged %s\n", fmt_term(disk, file->sub, esc_buffer));
				} else {
					log_tag("status:unrecoverable:%s:%s\n", disk->name, esc_tag(file->sub, esc_buffer));
					msg_info("unrecoverable %s\n", fmt_term(disk, file->sub, esc_buffer));
				}
			} else if (file_flag_has(file, FILE_IS_FIXED)) {
				log_tag("status:recoverable:%s:%s\n", disk->name, esc_tag(file->sub, esc_buffer));
				msg_info("recoverable %s\n", fmt_term(disk, file->sub, esc_buffer));
			} else {
				/* we don't use msg_verbose() because it also goes into the log */
				if (msg_level >= MSG_VERBOSE) {
					log_tag("status:correct:%s:%s\n", disk->name, esc_tag(file->sub, esc_buffer));
					msg_info("correct %s\n", fmt_term(disk, file->sub, esc_buffer));
				}
			}
		}

close_and_continue:
		/* if the opened file is the correct one, close it */
		/* in case of excluded and fragmented files it's possible */
		/* that the opened file is not the current one */
		if (handle[j].file == file) {
			/* ensure to close the file just after finishing with it */
			/* to avoid to keep it open without any possible use */
			ret = handle_close(&handle[j]);
			if (ret != 0) {
				/* LCOV_EXCL_START */
				log_tag("error:%u:%s:%s: Close error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
				log_fatal("DANGER! Unexpected close error in a data disk.\n");
				return -1;
				/* LCOV_EXCL_STOP */
			}
		}
	}

	return 0;
}
/**
 * Check if we have to process the specified block index ::i.
 *
 * When filtering for bad blocks only, a block not marked as bad is never processed.
 * Otherwise a block is processed if at least one parity is not excluded by
 * the filter, or if at least one disk has at this position a file that
 * is not marked as FILE_IS_EXCLUDED.
 *
 * Return 1 if the block has to be processed, 0 if not.
 */
static int block_is_enabled(struct snapraid_state* state, block_off_t i, struct snapraid_handle* handle, unsigned diskmax)
{
	/* block specific info */
	snapraid_info info = info_get(&state->infoarr, i);
	unsigned level;
	unsigned d;

	/* when filtering for bad blocks, skip all the other ones */
	if (state->opt.badonly && !info_get_bad(info))
		return 0;

	/* now apply the filters */

	/* a single not excluded parity is enough to include all blocks, even unused ones */
	for (level = 0; level < state->level; ++level) {
		if (!state->parity[level].is_excluded_by_filter)
			return 1;
	}

	/* otherwise include only the blocks used by a not filtered out file */
	for (d = 0; d < diskmax; ++d) {
		struct snapraid_disk* disk = handle[d].disk;
		struct snapraid_block* block;
		struct snapraid_file* file;

		/* without a disk there is nothing to check */
		if (!disk)
			continue;

		/* we consider all the files, even the ones without hash */
		/* because in some cases we can recover also them */
		block = fs_par2block_find(disk, i);
		if (!block_has_file(block))
			continue;

		file = fs_par2file_get(disk, i, 0);
		if (!file_flag_has(file, FILE_IS_EXCLUDED))
			return 1;
	}

	return 0;
}
static int state_check_process(struct snapraid_state* state, int fix, struct snapraid_parity_handle** parity, block_off_t blockstart, block_off_t blockmax)
{
struct snapraid_handle* handle;
unsigned diskmax;
block_off_t i;
unsigned j;
void* buffer_alloc;
void** buffer;
unsigned buffermax;
int ret;
data_off_t countsize;
block_off_t countpos;
block_off_t countmax;
unsigned error;
unsigned unrecoverable_error;
unsigned recovered_error;
struct failed_struct* failed;
unsigned* failed_map;
unsigned l;
char esc_buffer[ESC_MAX];
char esc_buffer_alt[ESC_MAX];
handle = handle_mapping(state, &diskmax);
/* we need 1 * data + 2 * parity + 1 * zero */
buffermax = diskmax + 2 * state->level + 1;
buffer = malloc_nofail_vector_align(diskmax, buffermax, state->block_size, &buffer_alloc);
if (!state->opt.skip_self)
mtest_vector(buffermax, state->block_size, buffer);
/* fill up the zero buffer */
memset(buffer[buffermax - 1], 0, state->block_size);
raid_zero(buffer[buffermax - 1]);
failed = malloc_nofail(diskmax * sizeof(struct failed_struct));
failed_map = malloc_nofail(diskmax * sizeof(unsigned));
error = 0;
unrecoverable_error = 0;
recovered_error = 0;
/* first count the number of blocks to process */
countmax = 0;
for (i = blockstart; i < blockmax; ++i) {
if (!block_is_enabled(state, i, handle, diskmax))
continue;
++countmax;
}
/* check all the blocks in files */
countsize = 0;
countpos = 0;
state_progress_begin(state, blockstart, blockmax, countmax);
for (i = blockstart; i < blockmax; ++i) {
unsigned failed_count;
int valid_parity;
int used_parity;
snapraid_info info;
int rehash;
if (!block_is_enabled(state, i, handle, diskmax)) {
/* post process the files */
ret = file_post(state, fix, i, handle, diskmax);
if (ret == -1) {
/* LCOV_EXCL_START */
log_fatal("Stopping at block %u\n", i);
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* and now continue with the next block */
continue;
}
/* If we have valid parity, and it makes sense to check its content. */
/* If we already know that the parity is invalid, we just read the file */
/* but we don't report parity errors */
/* Note that with auditonly, we anyway skip the full parity check, */
/* because we also don't read it at all */
valid_parity = 1;
/* If the parity is used by at least one file */
used_parity = 0;
/* keep track of the number of failed blocks */
failed_count = 0;
/* get block specific info */
info = info_get(&state->infoarr, i);
/* if we have to use the old hash */
rehash = info_get_rehash(info);
/* for each disk, process the block */
for (j = 0; j < diskmax; ++j) {
int read_size;
unsigned char hash[HASH_MAX];
struct snapraid_disk* disk;
struct snapraid_block* block;
struct snapraid_file* file;
block_off_t file_pos;
unsigned block_state;
/* if the disk position is not used */
disk = handle[j].disk;
if (!disk) {
/* use an empty block */
memset(buffer[j], 0, state->block_size);
continue;
}
/* if the disk block is not used */
block = fs_par2block_find(disk, i);
if (block == BLOCK_NULL) {
/* use an empty block */
memset(buffer[j], 0, state->block_size);
continue;
}
/* get the state of the block */
block_state = block_state_get(block);
/* if the parity is not valid */
if (block_has_invalid_parity(block)) {
/* mark the parity as invalid, and don't try to check/fix it */
/* because it will be recomputed at the next sync */
valid_parity = 0;
/* follow */
}
/* if the block is DELETED */
if (block_state == BLOCK_STATE_DELETED) {
/* use an empty block */
memset(buffer[j], 0, state->block_size);
/* store it in the failed set, because potentially */
/* the parity may be still computed with the previous content */
failed[failed_count].is_bad = 0; /* note that is_bad==0 <=> file==0 */
failed[failed_count].is_outofdate = 0;
failed[failed_count].index = j;
failed[failed_count].block = block;
failed[failed_count].disk = disk;
failed[failed_count].file = 0;
failed[failed_count].file_pos = 0;
failed[failed_count].handle = 0;
++failed_count;
continue;
}
/* here we are sure that the parity is used by a file */
used_parity = 1;
/* get the file of this block */
file = fs_par2file_get(disk, i, &file_pos);
/* if we are only hashing, we can skip excluded files and don't even read them */
if (state->opt.auditonly && file_flag_has(file, FILE_IS_EXCLUDED)) {
/* use an empty block */
/* strictly speaking, this is unnecessary, because we are not checking any parity */
/* but we keep it for completeness */
memset(buffer[j], 0, state->block_size);
continue;
}
/* if the file is closed or different than the current one */
if (handle[j].file == 0 || handle[j].file != file) {
/* close the old one, if any */
ret = handle_close(&handle[j]);
if (ret == -1) {
/* LCOV_EXCL_START */
log_tag("error:%u:%s:%s: Close error. %s\n", i, disk->name, esc_tag(handle[j].file->sub, esc_buffer), strerror(errno));
log_fatal("DANGER! Unexpected close error in a data disk.\n");
log_fatal("Stopping at block %u\n", i);
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* if fixing, and the file is not excluded, we must open for writing */
if (fix && !file_flag_has(file, FILE_IS_EXCLUDED)) {
/* if fixing, create the file, open for writing and resize if required */
ret = handle_create(&handle[j], file, state->file_mode);
if (ret == -1) {
/* LCOV_EXCL_START */
if (errno == EACCES) {
log_fatal("WARNING! Please give write permission to the file.\n");
} else {
log_fatal("DANGER! Without a working data disk, it isn't possible to fix errors on it.\n");
}
log_fatal("Stopping at block %u\n", i);
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* check if the file was just created */
if (handle[j].created != 0) {
/* if fragmented, it may be reopened, so remember that the file */
/* was originally missing */
file_flag_set(file, FILE_IS_CREATED);
}
} else {
/* open the file only for reading */
if (!file_flag_has(file, FILE_IS_MISSING))
ret = handle_open(&handle[j], file, state->file_mode,
log_error, state->opt.expected_missing ? log_expected : 0);
else
ret = -1; /* if the file is missing, we cannot open it */
if (ret == -1) {
/* save the failed block for the check/fix */
failed[failed_count].is_bad = 1;
failed[failed_count].is_outofdate = 0;
failed[failed_count].index = j;
failed[failed_count].block = block;
failed[failed_count].disk = disk;
failed[failed_count].file = file;
failed[failed_count].file_pos = file_pos;
failed[failed_count].handle = &handle[j];
++failed_count;
log_tag("error:%u:%s:%s: Open error at position %u\n", i, disk->name, esc_tag(file->sub, esc_buffer), file_pos);
++error;
/* mark the file as missing, to avoid retrying to open it */
/* note that this can be done only if we are not fixing it */
/* otherwise, it could be recreated */
file_flag_set(file, FILE_IS_MISSING);
continue;
}
}
/* if it's the first open, and not excluded */
if (!file_flag_has(file, FILE_IS_OPENED)
&& !file_flag_has(file, FILE_IS_EXCLUDED)) {
/* check if the file is changed */
if (handle[j].st.st_size != file->size
|| handle[j].st.st_mtime != file->mtime_sec
|| STAT_NSEC(&handle[j].st) != file->mtime_nsec
/* don't check the inode to support file-system without persistent inodes */
) {
/* report that the file is not synced */
file_flag_set(file, FILE_IS_UNSYNCED);
}
}
/* if it's the first open, and not excluded and larger */
if (!file_flag_has(file, FILE_IS_OPENED)
&& !file_flag_has(file, FILE_IS_EXCLUDED)
&& !(state->opt.syncedonly && file_flag_has(file, FILE_IS_UNSYNCED))
&& handle[j].st.st_size > file->size
) {
log_error("File '%s' is larger than expected.\n", handle[j].path);
log_tag("error:%u:%s:%s: Size error\n", i, disk->name, esc_tag(file->sub, esc_buffer));
++error;
if (fix) {
ret = handle_truncate(&handle[j], file);
if (ret == -1) {
/* LCOV_EXCL_START */
log_fatal("DANGER! Unexpected truncate error in a data disk, it isn't possible to fix.\n");
log_fatal("Stopping at block %u\n", i);
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
log_tag("fixed:%u:%s:%s: Fixed size\n", i, disk->name, esc_tag(file->sub, esc_buffer));
++recovered_error;
}
}
/* mark the file as opened at least one time */
/* this is used to avoid checking the unsynced state and the size */
/* more than once, in case the file is reopened later */
file_flag_set(file, FILE_IS_OPENED);
}
/* read from the file */
read_size = handle_read(&handle[j], file_pos, buffer[j], state->block_size,
log_error, state->opt.expected_missing ? log_expected : 0);
if (read_size == -1) {
/* save the failed block for the check/fix */
failed[failed_count].is_bad = 1; /* it's bad because we cannot read it */
failed[failed_count].is_outofdate = 0;
failed[failed_count].index = j;
failed[failed_count].block = block;
failed[failed_count].disk = disk;
failed[failed_count].file = file;
failed[failed_count].file_pos = file_pos;
failed[failed_count].handle = &handle[j];
++failed_count;
log_tag("error:%u:%s:%s: Read error at position %u\n", i, disk->name, esc_tag(file->sub, esc_buffer), file_pos);
++error;
continue;
}
countsize += read_size;
/* always insert CHG blocks, the repair functions need all of them */
/* because the parity may still be referring to the old state */
/* and the repair must be aware of it */
if (block_state == BLOCK_STATE_CHG) {
/* we DO NOT mark them as bad to avoid to overwrite them with wrong data. */
/* if we don't have a hash, we always assume the first read of the block correct. */
failed[failed_count].is_bad = 0; /* we assume the CHG block correct */
failed[failed_count].is_outofdate = 0;
failed[failed_count].index = j;
failed[failed_count].block = block;
failed[failed_count].disk = disk;
failed[failed_count].file = file;
failed[failed_count].file_pos = file_pos;
failed[failed_count].handle = &handle[j];
++failed_count;
continue;
}
assert(block_state == BLOCK_STATE_BLK || block_state == BLOCK_STATE_REP);
/* compute the hash of the block just read */
if (rehash) {
memhash(state->prevhash, state->prevhashseed, hash, buffer[j], read_size);
} else {
memhash(state->hash, state->hashseed, hash, buffer[j], read_size);
}
/* compare the hash */
if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
unsigned diff = memdiff(hash, block->hash, BLOCK_HASH_SIZE);
/* save the failed block for the check/fix */
failed[failed_count].is_bad = 1; /* it's bad because the hash doesn't match */
failed[failed_count].is_outofdate = 0;
failed[failed_count].index = j;
failed[failed_count].block = block;
failed[failed_count].disk = disk;
failed[failed_count].file = file;
failed[failed_count].file_pos = file_pos;
failed[failed_count].handle = &handle[j];
++failed_count;
log_tag("error:%u:%s:%s: Data error at position %u, diff bits %u/%u\n", i, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);
++error;
continue;
}
/* always insert REP blocks, the repair functions need all of them */
/* because the parity may still be referring to the old state */
/* and the repair must be aware of it */
if (block_state == BLOCK_STATE_REP) {
failed[failed_count].is_bad = 0; /* it's not bad */
failed[failed_count].is_outofdate = 0;
failed[failed_count].index = j;
failed[failed_count].block = block;
failed[failed_count].disk = disk;
failed[failed_count].file = file;
failed[failed_count].file_pos = file_pos;
failed[failed_count].handle = &handle[j];
++failed_count;
continue;
}
}
/* now read and check the parity if requested */
if (!state->opt.auditonly) {
void* buffer_recov[LEV_MAX];
void* buffer_zero;
/* buffers for parity read and not computed */
for (l = 0; l < state->level; ++l)
buffer_recov[l] = buffer[diskmax + state->level + l];
for (; l < LEV_MAX; ++l)
buffer_recov[l] = 0;
/* the zero buffer is the last one */
buffer_zero = buffer[buffermax - 1];
/* read the parity */
for (l = 0; l < state->level; ++l) {
if (parity[l]) {
ret = parity_read(parity[l], i, buffer_recov[l], state->block_size, log_error);
if (ret == -1) {
buffer_recov[l] = 0; /* no parity to use */
log_tag("parity_error:%u:%s: Read error\n", i, lev_config_name(l));
++error;
}
} else {
buffer_recov[l] = 0;
}
}
/* try all the recovering strategies */
ret = repair(state, rehash, i, diskmax, failed, failed_map, failed_count, buffer, buffer_recov, buffer_zero);
if (ret != 0) {
/* increment the number of errors */
if (ret > 0)
error += ret;
++unrecoverable_error;
/* print a list of all the errors in files */
for (j = 0; j < failed_count; ++j) {
if (failed[j].is_bad)
log_tag("unrecoverable:%u:%s:%s: Unrecoverable error at position %u\n", i, failed[j].disk->name, esc_tag(failed[j].file->sub, esc_buffer), failed[j].file_pos);
}
/* keep track of damaged files */
for (j = 0; j < failed_count; ++j) {
if (failed[j].is_bad)
file_flag_set(failed[j].file, FILE_IS_DAMAGED);
}
} else {
/* now count the partial recoveries */
/* note that this can happen only when we have an incomplete 'sync' */
/* and what we have recovered is the state before the 'sync' */
int partial_recover_error = 0;
/* print a list of all the errors in files */
for (j = 0; j < failed_count; ++j) {
if (failed[j].is_bad && failed[j].is_outofdate) {
++partial_recover_error;
log_tag("unrecoverable:%u:%s:%s: Unrecoverable unsynced error at position %u\n", i, failed[j].disk->name, esc_tag(failed[j].file->sub, esc_buffer), failed[j].file_pos);
}
}
if (partial_recover_error != 0) {
error += partial_recover_error;
++unrecoverable_error;
}
/*
* Check parities, but only if all the blocks have it computed and it's used.
*
* If you check/fix after a partial sync, it's OK to have parity errors
* on the blocks with invalid parity and doesn't make sense to try to fix it.
*
* It's also OK to have data errors on unused parity, because sync doesn't
* update it.
*/
if (used_parity && valid_parity) {
/* check the parity */
for (l = 0; l < state->level; ++l) {
if (buffer_recov[l] != 0 && memcmp(buffer_recov[l], buffer[diskmax + l], state->block_size) != 0) {
unsigned diff = memdiff(buffer_recov[l], buffer[diskmax + l], state->block_size);
/* mark that the read parity is wrong, setting ptr to 0 */
buffer_recov[l] = 0;
log_tag("parity_error:%u:%s: Data error, diff bits %u/%u\n", i, lev_config_name(l), diff, state->block_size * 8);
++error;
}
}
}
/* now write recovered files */
if (fix) {
/* update the fixed files */
for (j = 0; j < failed_count; ++j) {
/* nothing to do if it doesn't need recovering */
if (!failed[j].is_bad)
continue;
/* do not fix if the file is excluded */
if (file_flag_has(failed[j].file, FILE_IS_EXCLUDED)
|| (state->opt.syncedonly && file_flag_has(failed[j].file, FILE_IS_UNSYNCED)))
continue;
ret = handle_write(failed[j].handle, failed[j].file_pos, buffer[failed[j].index], state->block_size);
if (ret == -1) {
/* LCOV_EXCL_START */
/* mark the file as damaged */
file_flag_set(failed[j].file, FILE_IS_DAMAGED);
if (errno == EACCES) {
log_fatal("WARNING! Please give write permission to the file.\n");
} else {
/* we do not use DANGER because it could be ENOSPC which is not always correctly reported */
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
}
log_fatal("Stopping at block %u\n", i);
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* if we are not sure that the recovered content is uptodate */
if (failed[j].is_outofdate) {
/* mark the file as damaged */
file_flag_set(failed[j].file, FILE_IS_DAMAGED);
continue;
}
/* mark the file as containing some fixes */
/* note that it could be also marked as damaged in other iterations */
file_flag_set(failed[j].file, FILE_IS_FIXED);
log_tag("fixed:%u:%s:%s: Fixed data error at position %u\n", i, failed[j].disk->name, esc_tag(failed[j].file->sub, esc_buffer), failed[j].file_pos);
++recovered_error;
}
/*
* Update parity only if all the blocks have it computed and it's used.
*
* If you check/fix after a partial sync, you do not want to fix parity
* for blocks that are going to have it computed in the sync completion.
*
* For unused parity there is no need to write it, because when fixing
* we already have allocated space for it on parity file creation,
* and its content doesn't matter.
*/
if (used_parity && valid_parity) {
/* update the parity */
for (l = 0; l < state->level; ++l) {
/* if the parity on disk is wrong */
if (buffer_recov[l] == 0
/* and we have access at the parity */
&& parity[l] != 0
/* and the parity is not excluded */
&& !state->parity[l].is_excluded_by_filter
) {
ret = parity_write(parity[l], i, buffer[diskmax + l], state->block_size);
if (ret == -1) {
/* LCOV_EXCL_START */
/* we do not use DANGER because it could be ENOSPC which is not always correctly reported */
log_fatal("WARNING! Without a working %s disk, it isn't possible to fix errors on it.\n", lev_name(l));
log_fatal("Stopping at block %u\n", i);
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
log_tag("parity_fixed:%u:%s: Fixed data error\n", i, lev_config_name(l));
++recovered_error;
}
}
}
} else {
/* if we are not fixing, we just set the FIXED flag */
/* meaning that we could fix this file if we try */
for (j = 0; j < failed_count; ++j) {
if (failed[j].is_bad) {
file_flag_set(failed[j].file, FILE_IS_FIXED);
}
}
}
}
} else {
/* if we are not checking, we just set the DAMAGED flag */
/* to report that the file is damaged, and we don't know if we can fix it */
for (j = 0; j < failed_count; ++j) {
if (failed[j].is_bad) {
file_flag_set(failed[j].file, FILE_IS_DAMAGED);
}
}
}
/* post process the files */
ret = file_post(state, fix, i, handle, diskmax);
if (ret == -1) {
/* LCOV_EXCL_START */
log_fatal("Stopping at block %u\n", i);
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* count the number of processed block */
++countpos;
/* progress */
if (state_progress(state, 0, i, countpos, countmax, countsize)) {
/* LCOV_EXCL_START */
break;
/* LCOV_EXCL_STOP */
}
}
/* for each disk, recover empty files, symlinks and empty dirs */
for (i = 0; i < diskmax; ++i) {
tommy_node* node;
struct snapraid_disk* disk;
if (!handle[i].disk)
continue;
/* for each empty file in the disk */
disk = handle[i].disk;
node = disk->filelist;
while (node) {
char path[PATH_MAX];
struct stat st;
struct snapraid_file* file;
int unsuccessful = 0;
file = node->data;
node = node->next; /* next node */
/* if not empty, it's already checked and continue to the next one */
if (file->size != 0) {
continue;
}
/* if excluded continue to the next one */
if (file_flag_has(file, FILE_IS_EXCLUDED)) {
continue;
}
/* stat the file */
pathprint(path, sizeof(path), "%s%s", disk->dir, file->sub);
ret = stat(path, &st);
if (ret == -1) {
unsuccessful = 1;
log_error("Error stating empty file '%s'. %s.\n", path, strerror(errno));
log_tag("error:%s:%s: Empty file stat error\n", disk->name, esc_tag(file->sub, esc_buffer));
++error;
} else if (!S_ISREG(st.st_mode)) {
unsuccessful = 1;
log_tag("error:%s:%s: Empty file error for not regular file\n", disk->name, esc_tag(file->sub, esc_buffer));
++error;
} else if (st.st_size != 0) {
unsuccessful = 1;
log_tag("error:%s:%s: Empty file error for size '%" PRIu64 "'\n", disk->name, esc_tag(file->sub, esc_buffer), (uint64_t)st.st_size);
++error;
}
if (fix && unsuccessful) {
int f;
/* create the ancestor directories */
ret = mkancestor(path);
if (ret != 0) {
/* LCOV_EXCL_START */
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
log_fatal("Stopping\n");
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* create it */
/* O_NOFOLLOW: do not follow links to ensure to open the real file */
f = open(path, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_NOFOLLOW, 0600);
if (f == -1) {
/* LCOV_EXCL_START */
log_fatal("Error creating empty file '%s'. %s.\n", path, strerror(errno));
if (errno == EACCES) {
log_fatal("WARNING! Please give write permission to the file.\n");
} else {
/* we do not use DANGER because it could be ENOSPC which is not always correctly reported */
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
}
log_fatal("Stopping\n");
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* set the original modification time */
ret = fmtime(f, file->mtime_sec, file->mtime_nsec);
if (ret != 0) {
/* LCOV_EXCL_START */
close(f);
log_fatal("Error timing file '%s'. %s.\n", file->sub, strerror(errno));
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
log_fatal("Stopping\n");
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* close it */
ret = close(f);
if (ret != 0) {
/* LCOV_EXCL_START */
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
log_fatal("Stopping\n");
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
log_tag("fixed:%s:%s: Fixed empty file\n", disk->name, esc_tag(file->sub, esc_buffer));
++recovered_error;
log_tag("status:recovered:%s:%s\n", disk->name, esc_tag(file->sub, esc_buffer));
msg_info("recovered %s\n", fmt_term(disk, file->sub, esc_buffer));
}
}
/* for each link in the disk */
disk = handle[i].disk;
node = disk->linklist;
while (node) {
char path[PATH_MAX];
char pathto[PATH_MAX];
char linkto[PATH_MAX];
struct stat st;
struct stat stto;
struct snapraid_link* slink;
int unsuccessful = 0;
int unrecoverable = 0;
slink = node->data;
node = node->next; /* next node */
/* if excluded continue to the next one */
if (link_flag_has(slink, FILE_IS_EXCLUDED)) {
continue;
}
if (link_flag_has(slink, FILE_IS_HARDLINK)) {
/* stat the link */
pathprint(path, sizeof(path), "%s%s", disk->dir, slink->sub);
ret = stat(path, &st);
if (ret == -1) {
unsuccessful = 1;
log_error("Error stating hardlink '%s'. %s.\n", path, strerror(errno));
log_tag("hardlink_error:%s:%s:%s: Hardlink stat error\n", disk->name, esc_tag(slink->sub, esc_buffer), esc_tag(slink->linkto, esc_buffer_alt));
++error;
} else if (!S_ISREG(st.st_mode)) {
unsuccessful = 1;
log_tag("hardlink_error:%s:%s:%s: Hardlink error for not regular file\n", disk->name, esc_tag(slink->sub, esc_buffer), esc_tag(slink->linkto, esc_buffer_alt));
++error;
}
/* stat the "to" file */
pathprint(pathto, sizeof(pathto), "%s%s", disk->dir, slink->linkto);
ret = stat(pathto, &stto);
if (ret == -1) {
unsuccessful = 1;
if (errno == ENOENT) {
unrecoverable = 1;
if (fix) {
/* if the target doesn't exist, it's unrecoverable */
/* because we cannot create a hardlink to a file that */
/* doesn't exist */
++unrecoverable_error;
} else {
/* but in check, we can assume that fixing will recover */
/* such missing file, so we assume a less drastic error */
++error;
}
}
log_error("Error stating hardlink-to '%s'. %s.\n", pathto, strerror(errno));
log_tag("hardlink_error:%s:%s:%s: Hardlink to stat error\n", disk->name, esc_tag(slink->sub, esc_buffer), esc_tag(slink->linkto, esc_buffer_alt));
++error;
} else if (!S_ISREG(stto.st_mode)) {
unsuccessful = 1;
log_tag("hardlink_error:%s:%s:%s: Hardlink-to error for not regular file\n", disk->name, esc_tag(slink->sub, esc_buffer), esc_tag(slink->linkto, esc_buffer_alt));
++error;
} else if (!unsuccessful && st.st_ino != stto.st_ino) {
unsuccessful = 1;
log_error("Mismatch hardlink '%s' and '%s'. Different inode.\n", path, pathto);
log_tag("hardlink_error:%s:%s:%s: Hardlink mismatch for different inode\n", disk->name, esc_tag(slink->sub, esc_buffer), esc_tag(slink->linkto, esc_buffer_alt));
++error;
}
} else {
/* read the symlink */
pathprint(path, sizeof(path), "%s%s", disk->dir, slink->sub);
ret = readlink(path, linkto, sizeof(linkto));
if (ret < 0) {
unsuccessful = 1;
log_error("Error reading symlink '%s'. %s.\n", path, strerror(errno));
log_tag("symlink_error:%s:%s: Symlink read error\n", disk->name, esc_tag(slink->sub, esc_buffer));
++error;
} else if (ret >= PATH_MAX) {
unsuccessful = 1;
log_error("Error reading symlink '%s'. Symlink too long.\n", path);
log_tag("symlink_error:%s:%s: Symlink read error\n", disk->name, esc_tag(slink->sub, esc_buffer));
++error;
} else {
linkto[ret] = 0;
if (strcmp(linkto, slink->linkto) != 0) {
unsuccessful = 1;
log_tag("symlink_error:%s:%s: Symlink data error '%s' instead of '%s'\n", disk->name, esc_tag(slink->sub, esc_buffer), linkto, slink->linkto);
++error;
}
}
}
if (fix && unsuccessful && !unrecoverable) {
/* create the ancestor directories */
ret = mkancestor(path);
if (ret != 0) {
/* LCOV_EXCL_START */
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
log_fatal("Stopping\n");
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* if it exists, it must be deleted before recreating */
ret = remove(path);
if (ret != 0 && errno != ENOENT) {
/* LCOV_EXCL_START */
log_fatal("Error removing '%s'. %s.\n", path, strerror(errno));
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
log_fatal("Stopping\n");
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* create it */
if (link_flag_has(slink, FILE_IS_HARDLINK)) {
ret = hardlink(pathto, path);
if (ret != 0) {
/* LCOV_EXCL_START */
log_fatal("Error writing hardlink '%s' to '%s'. %s.\n", path, pathto, strerror(errno));
if (errno == EACCES) {
log_fatal("WARNING! Please give write permission to the hardlink.\n");
} else {
/* we do not use DANGER because it could be ENOSPC which is not always correctly reported */
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
}
log_fatal("Stopping\n");
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
log_tag("hardlink_fixed:%s:%s: Fixed hardlink error\n", disk->name, esc_tag(slink->sub, esc_buffer));
++recovered_error;
} else {
ret = symlink(slink->linkto, path);
if (ret != 0) {
/* LCOV_EXCL_START */
log_fatal("Error writing symlink '%s' to '%s'. %s.\n", path, slink->linkto, strerror(errno));
if (errno == EACCES) {
log_fatal("WARNING! Please give write permission to the symlink.\n");
} else {
/* we do not use DANGER because it could be ENOSPC which is not always correctly reported */
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
}
log_fatal("Stopping\n");
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
log_tag("symlink_fixed:%s:%s: Fixed symlink error\n", disk->name, esc_tag(slink->sub, esc_buffer));
++recovered_error;
}
log_tag("status:recovered:%s:%s\n", disk->name, esc_tag(slink->sub, esc_buffer));
msg_info("recovered %s\n", fmt_term(disk, slink->sub, esc_buffer));
}
}
/* for each dir in the disk */
disk = handle[i].disk;
node = disk->dirlist;
while (node) {
char path[PATH_MAX];
struct stat st;
struct snapraid_dir* dir;
int unsuccessful = 0;
dir = node->data;
node = node->next; /* next node */
/* if excluded continue to the next one */
if (dir_flag_has(dir, FILE_IS_EXCLUDED)) {
continue;
}
/* stat the dir */
pathprint(path, sizeof(path), "%s%s", disk->dir, dir->sub);
ret = stat(path, &st);
if (ret == -1) {
unsuccessful = 1;
log_error("Error stating dir '%s'. %s.\n", path, strerror(errno));
log_tag("dir_error:%s:%s: Dir stat error\n", disk->name, esc_tag(dir->sub, esc_buffer));
++error;
} else if (!S_ISDIR(st.st_mode)) {
unsuccessful = 1;
log_tag("dir_error:%s:%s: Dir error for not directory\n", disk->name, esc_tag(dir->sub, esc_buffer));
++error;
}
if (fix && unsuccessful) {
/* create the ancestor directories */
ret = mkancestor(path);
if (ret != 0) {
/* LCOV_EXCL_START */
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
log_fatal("Stopping\n");
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
/* create it */
ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
if (ret != 0) {
/* LCOV_EXCL_START */
log_fatal("Error creating dir '%s'. %s.\n", path, strerror(errno));
if (errno == EACCES) {
log_fatal("WARNING! Please give write permission to the dir.\n");
} else {
/* we do not use DANGER because it could be ENOSPC which is not always correctly reported */
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
}
log_fatal("Stopping\n");
++unrecoverable_error;
goto bail;
/* LCOV_EXCL_STOP */
}
log_tag("dir_fixed:%s:%s: Fixed dir error\n", disk->name, esc_tag(dir->sub, esc_buffer));
++recovered_error;
log_tag("status:recovered:%s:%s\n", disk->name, esc_tag(dir->sub, esc_buffer));
msg_info("recovered %s\n", fmt_term(disk, dir->sub, esc_buffer));
}
}
}
state_progress_end(state, countpos, countmax, countsize);
bail:
/* close all the files left open */
for (j = 0; j < diskmax; ++j) {
struct snapraid_file* file = handle[j].file;
struct snapraid_disk* disk = handle[j].disk;
ret = handle_close(&handle[j]);
if (ret == -1) {
/* LCOV_EXCL_START */
log_tag("error:%u:%s:%s: Close error. %s\n", blockmax, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
log_fatal("DANGER! Unexpected close error in a data disk.\n");
++unrecoverable_error;
/* continue, as we are already exiting */
/* LCOV_EXCL_STOP */
}
}
/* remove all the files created from scratch that have not finished the processing */
/* this happens only when aborting, by pressing Ctrl+C or for some other reason */
if (fix) {
/* for each disk */
for (i = 0; i < diskmax; ++i) {
tommy_node* node;
struct snapraid_disk* disk;
if (!handle[i].disk)
continue;
/* for each file in the disk */
disk = handle[i].disk;
node = disk->filelist;
while (node) {
char path[PATH_MAX];
struct snapraid_file* file;
file = node->data;
node = node->next; /* next node */
/* if the file was not created, meaning that it was already existing */
if (!file_flag_has(file, FILE_IS_CREATED)) {
/* nothing to do */
continue;
}
/* if processing was finished */
if (file_flag_has(file, FILE_IS_FINISHED)) {
/* nothing to do */
continue;
}
/* if the file was originally missing, and processing not yet finished */
/* we have to throw it away to ensure that at the next run we will retry */
/* to fix it, in case we select to undelete missing files */
pathprint(path, sizeof(path), "%s%s", disk->dir, file->sub);
ret = remove(path);
if (ret != 0) {
/* LCOV_EXCL_START */
log_fatal("Error removing '%s'. %s.\n", path, strerror(errno));
log_fatal("WARNING! Without a working data disk, it isn't possible to fix errors on it.\n");
++unrecoverable_error;
/* continue, as we are already exiting */
/* LCOV_EXCL_STOP */
}
}
}
}
if (error || recovered_error || unrecoverable_error) {
msg_status("\n");
msg_status("%8u errors\n", error);
if (fix) {
msg_status("%8u recovered errors\n", recovered_error);
}
if (unrecoverable_error) {
msg_status("%8u UNRECOVERABLE errors\n", unrecoverable_error);
} else {
/* without checking, we don't know if they are really recoverable or not */
if (!state->opt.auditonly)
msg_status("%8u unrecoverable errors\n", unrecoverable_error);
if (fix)
msg_status("Everything OK\n");
}
} else {
msg_status("Everything OK\n");
}
if (error && !fix)
log_fatal("WARNING! There are errors!\n");
if (unrecoverable_error)
log_fatal("DANGER! There are unrecoverable errors!\n");
log_tag("summary:error:%u\n", error);
if (fix)
log_tag("summary:error_recovered:%u\n", recovered_error);
if (!state->opt.auditonly)
log_tag("summary:error_unrecoverable:%u\n", unrecoverable_error);
if (fix) {
if (error + recovered_error + unrecoverable_error == 0)
log_tag("summary:exit:ok\n");
else if (unrecoverable_error == 0)
log_tag("summary:exit:recovered\n");
else
log_tag("summary:exit:unrecoverable\n");
} else if (!state->opt.auditonly) {
if (error + unrecoverable_error == 0)
log_tag("summary:exit:ok\n");
else if (unrecoverable_error == 0)
log_tag("summary:exit:recoverable\n");
else
log_tag("summary:exit:unrecoverable\n");
} else { /* audit only */
if (error == 0)
log_tag("summary:exit:ok\n");
else
log_tag("summary:exit:error\n");
}
log_flush();
free(failed);
free(failed_map);
free(handle);
free(buffer_alloc);
free(buffer);
/* fail if some errors are present after the run */
if (fix) {
if (state->opt.expect_unrecoverable) {
if (unrecoverable_error == 0)
return -1;
} else {
if (unrecoverable_error != 0)
return -1;
}
} else {
if (state->opt.expect_unrecoverable) {
if (unrecoverable_error == 0)
return -1;
} else if (state->opt.expect_recoverable) {
if (unrecoverable_error != 0 || error == 0)
return -1;
} else {
if (error != 0 || unrecoverable_error != 0)
return -1;
}
}
return 0;
}
/**
 * Run a check, fix or audit pass over a range of parity blocks.
 *
 * The parity files are opened according to the mode:
 * - fix: every level is created/opened for writing and resized to the
 *   full parity size; levels flagged skip_access are not used, and levels
 *   flagged is_excluded_by_filter are only opened for reading (best effort).
 * - check (no fix, no auditonly): every level is opened for reading;
 *   a level that cannot be opened is reported and simply not used.
 * - audit only: no parity is used at all.
 *
 * Then state_check_process() is called on the requested range, and all
 * the opened parity files are closed (and truncated, when fixing).
 *
 * @param state Global state.
 * @param fix If different than 0, the errors are fixed, otherwise only checked.
 * @param blockstart First block to process.
 *   If it's bigger than the allocated parity size the program terminates
 *   with exit(EXIT_FAILURE).
 * @param blockcount Number of blocks to process. 0 to process up to the
 *   end of the parity.
 * @return 0 on success. -1 if state_check_process() returned -1, or if
 *   truncating or closing a parity file failed.
 */
int state_check(struct snapraid_state* state, int fix, block_off_t blockstart, block_off_t blockcount)
{
	block_off_t blockmax;
	data_off_t size;
	int ret;
	struct snapraid_parity_handle parity[LEV_MAX];
	struct snapraid_parity_handle* parity_ptr[LEV_MAX];
	unsigned error;
	unsigned l;

	msg_progress("Initializing...\n");

	blockmax = parity_allocated_size(state);

	/* the parity size in bytes always refers to the full parity, */
	/* and it's computed before blockmax is restricted to the requested range */
	size = blockmax * (data_off_t)state->block_size;

	if (blockstart > blockmax) {
		/* LCOV_EXCL_START */
		log_fatal("Error in the specified starting block %u. It's bigger than the parity size %u.\n", blockstart, blockmax);
		exit(EXIT_FAILURE);
		/* LCOV_EXCL_STOP */
	}

	/* adjust the number of block to process */
	/* here blockstart <= blockmax, then the subtraction cannot wrap; */
	/* comparing against the remaining blocks avoids the overflow that */
	/* "blockstart + blockcount" has with a very big blockcount, */
	/* that would result in a bogus small blockmax and in a silently skipped run */
	if (blockcount != 0 && blockcount < blockmax - blockstart) {
		blockmax = blockstart + blockcount;
	}

	if (fix) {
		/* if fixing, create the file and open for writing */
		/* if it fails, we cannot continue */
		for (l = 0; l < state->level; ++l) {
			/* skip parity disks that are not accessible */
			if (state->parity[l].skip_access) {
				parity_ptr[l] = 0;
				continue;
			}

			parity_ptr[l] = &parity[l];

			/* if the parity is excluded */
			if (state->parity[l].is_excluded_by_filter) {
				/* open for reading, and ignore error */
				ret = parity_open(parity_ptr[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
				if (ret == -1) {
					/* continue anyway */
					parity_ptr[l] = 0;
				}
			} else {
				/* open for writing */
				ret = parity_create(parity_ptr[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
				if (ret == -1) {
					/* LCOV_EXCL_START */
					log_fatal("WARNING! Without an accessible %s file, it isn't possible to fix any error.\n", lev_name(l));
					exit(EXIT_FAILURE);
					/* LCOV_EXCL_STOP */
				}

				/* resize the parity to the full size */
				ret = parity_chsize(parity_ptr[l], &state->parity[l], 0, size, state->block_size, state->opt.skip_fallocate, state->opt.skip_space_holder);
				if (ret == -1) {
					/* LCOV_EXCL_START */
					/* NOTE(review): the message mentions "sync" but we are fixing; text kept as-is */
					log_fatal("WARNING! Without an accessible %s file, it isn't possible to sync.\n", lev_name(l));
					exit(EXIT_FAILURE);
					/* LCOV_EXCL_STOP */
				}
			}
		}
	} else if (!state->opt.auditonly) {
		/* if checking, open the file for reading */
		/* it may fail if the file doesn't exist, in this case we continue to check the files */
		for (l = 0; l < state->level; ++l) {
			parity_ptr[l] = &parity[l];
			ret = parity_open(parity_ptr[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
			if (ret == -1) {
				msg_status("No accessible %s file, only files will be checked.\n", lev_name(l));
				/* continue anyway */
				parity_ptr[l] = 0;
			}
		}
	} else {
		/* otherwise don't use any parity */
		for (l = 0; l < state->level; ++l)
			parity_ptr[l] = 0;
	}

	if (fix)
		msg_progress("Fixing...\n");
	else if (!state->opt.auditonly)
		msg_progress("Checking...\n");
	else
		msg_progress("Hashing...\n");

	error = 0;

	/* skip degenerated cases of empty parity, or skipping all */
	if (blockstart < blockmax) {
		ret = state_check_process(state, fix, parity_ptr, blockstart, blockmax);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			++error;
			/* continue, as we are already exiting */
			/* LCOV_EXCL_STOP */
		}
	}

	/* try to close only if opened */
	for (l = 0; l < state->level; ++l) {
		if (parity_ptr[l]) {
			/* if fixing and not excluded, truncate parity not valid */
			if (fix && !state->parity[l].is_excluded_by_filter) {
				ret = parity_truncate(parity_ptr[l]);
				if (ret == -1) {
					/* LCOV_EXCL_START */
					log_fatal("DANGER! Unexpected truncate error in %s disk.\n", lev_name(l));
					++error;
					/* continue, as we are already exiting */
					/* LCOV_EXCL_STOP */
				}
			}

			ret = parity_close(parity_ptr[l]);
			if (ret == -1) {
				/* LCOV_EXCL_START */
				log_fatal("DANGER! Unexpected close error in %s disk.\n", lev_name(l));
				++error;
				/* continue, as we are already exiting */
				/* LCOV_EXCL_STOP */
			}
		}
	}

	/* abort if error are present */
	if (error != 0)
		return -1;

	return 0;
}