/*
 * Copyright (C) 2011 Andrea Mazzoleni
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
|
|
|
|
#include "portable.h"
|
|
|
|
#include "support.h"
|
|
#include "elem.h"
|
|
#include "state.h"
|
|
#include "parity.h"
|
|
#include "handle.h"
|
|
#include "io.h"
|
|
#include "raid/raid.h"
|
|
|
|
/****************************************************************************/
|
|
/* hash */
|
|
|
|
/**
 * Hash the REP and CHG blocks of every disk in the range [blockstart, blockmax).
 *
 * For each such block the data is read from its file and hashed:
 * - REP blocks: the computed hash is compared with the stored one. A mismatch
 *   is counted as a data error and ::skip_sync is set, so that the parity
 *   is not overwritten before a recovery can be attempted.
 * - CHG blocks: the computed hash is stored in the block, the block becomes
 *   REP, and the state is marked as needing a write.
 *
 * Missing files, files without access permission and files with changed
 * attributes are counted as errors and skipped. Close errors, read errors
 * and any other open error abort the whole process.
 *
 * @param state State to process.
 * @param blockstart First block to process.
 * @param blockmax Block after the last one to process.
 * @param skip_sync Set to 1 when the following sync must not run
 *   (data mismatch, stop request, or abort). Never cleared here.
 * @return 0 if no error was counted, -1 otherwise.
 */
static int state_hash_process(struct snapraid_state* state, block_off_t blockstart, block_off_t blockmax, int* skip_sync)
{
	struct snapraid_handle* handle;
	unsigned diskmax;
	block_off_t i;
	unsigned j;
	void* buffer;
	void* buffer_alloc;
	data_off_t countsize;
	block_off_t countpos;
	block_off_t countmax;
	int ret;
	unsigned error;
	unsigned silent_error;
	unsigned io_error;
	int interrupted;
	char esc_buffer[ESC_MAX];

	/* maps the disks to handles */
	handle = handle_mapping(state, &diskmax);

	/* buffer for reading */
	buffer = malloc_nofail_direct(state->block_size, &buffer_alloc);
	if (!state->opt.skip_self)
		mtest_vector(1, state->block_size, &buffer);

	error = 0;
	silent_error = 0;
	io_error = 0;
	interrupted = 0;

	/* first count the number of blocks to process */
	countmax = 0;
	for (j = 0; j < diskmax; ++j) {
		struct snapraid_disk* disk = handle[j].disk;

		/* if no disk, nothing to check */
		if (!disk)
			continue;

		for (i = blockstart; i < blockmax; ++i) {
			struct snapraid_block* block;
			unsigned block_state;

			block = fs_par2block_find(disk, i);

			/* get the state of the block */
			block_state = block_state_get(block);

			/* process REP and CHG blocks */
			if (block_state != BLOCK_STATE_REP && block_state != BLOCK_STATE_CHG)
				continue;

			++countmax;
		}
	}

	/* drop until now */
	state_usage_waste(state);

	countsize = 0;
	countpos = 0;
	if (!state_progress_begin(state, blockstart, blockmax, countmax))
		goto end;

	for (j = 0; j < diskmax; ++j) {
		struct snapraid_disk* disk = handle[j].disk;

		/* if no disk, nothing to check */
		if (!disk)
			continue;

		for (i = blockstart; i < blockmax; ++i) {
			snapraid_info info;
			int rehash;
			struct snapraid_block* block;
			int read_size;
			unsigned char hash[HASH_MAX];
			unsigned block_state;
			struct snapraid_file* file;
			block_off_t file_pos;

			block = fs_par2block_find(disk, i);

			/* get the state of the block */
			block_state = block_state_get(block);

			/* process REP and CHG blocks */
			if (block_state != BLOCK_STATE_REP && block_state != BLOCK_STATE_CHG)
				continue;

			/* get the file of this block */
			file = fs_par2file_get(disk, i, &file_pos);

			/* get block specific info */
			info = info_get(&state->infoarr, i);

			/* if we have to use the old hash */
			rehash = info_get_rehash(info);

			/* until now is misc */
			state_usage_misc(state);

			/* if the file is different than the current one, close it */
			if (handle[j].file != 0 && handle[j].file != file) {
				/* keep a pointer at the file we are going to close for error reporting */
				struct snapraid_file* report = handle[j].file;
				ret = handle_close(&handle[j]);
				if (ret == -1) {
					/* LCOV_EXCL_START */
					/* This one is really an unexpected error, because we are only reading */
					/* and closing a descriptor should never fail */
					if (errno == EIO) {
						log_tag("error:%u:%s:%s: Close EIO error. %s\n", i, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
						log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to sync.\n");
						log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle[j].path);
						log_fatal("Stopping at block %u\n", i);
						++io_error;
						goto bail;
					}

					log_tag("error:%u:%s:%s: Close error. %s\n", i, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
					log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to sync.\n");
					log_fatal("Ensure that file '%s' can be accessed.\n", handle[j].path);
					log_fatal("Stopping at block %u\n", i);
					++error;
					goto bail;
					/* LCOV_EXCL_STOP */
				}
			}

			ret = handle_open(&handle[j], file, state->file_mode, log_error, 0);
			if (ret == -1) {
				if (errno == EIO) {
					/* LCOV_EXCL_START */
					log_tag("error:%u:%s:%s: Open EIO error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
					log_fatal("DANGER! Unexpected input/output open error in a data disk, it isn't possible to sync.\n");
					log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle[j].path);
					log_fatal("Stopping at block %u\n", i);
					++io_error;
					goto bail;
					/* LCOV_EXCL_STOP */
				}

				if (errno == ENOENT) {
					log_tag("error:%u:%s:%s: Open ENOENT error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
					log_error("Missing file '%s'.\n", handle[j].path);
					log_error("WARNING! You cannot modify data disk during a sync.\n");
					log_error("Rerun the sync command when finished.\n");
					++error;
					/* if the file is missing, it means that it was removed during sync */
					/* this isn't a serious error, so we skip this block, and continue with others */
					continue;
				}

				if (errno == EACCES) {
					log_tag("error:%u:%s:%s: Open EACCES error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
					log_error("No access at file '%s'.\n", handle[j].path);
					log_error("WARNING! Please fix the access permission in the data disk.\n");
					log_error("Rerun the sync command when finished.\n");
					++error;
					/* this isn't a serious error, so we skip this block, and continue with others */
					continue;
				}

				/* LCOV_EXCL_START */
				log_tag("error:%u:%s:%s: Open error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
				log_fatal("WARNING! Unexpected open error in a data disk, it isn't possible to sync.\n");
				log_fatal("Ensure that file '%s' can be accessed.\n", handle[j].path);
				log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, file->sub, esc_buffer));
				++error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}

			/* check if the file is changed */
			if (handle[j].st.st_size != file->size
				|| handle[j].st.st_mtime != file->mtime_sec
				|| STAT_NSEC(&handle[j].st) != file->mtime_nsec
				|| handle[j].st.st_ino != file->inode
			) {
				log_tag("error:%u:%s:%s: Unexpected attribute change\n", i, disk->name, esc_tag(file->sub, esc_buffer));
				if (handle[j].st.st_size != file->size) {
					log_error("Unexpected size change at file '%s' from %" PRIu64 " to %" PRIu64 ".\n", handle[j].path, file->size, (uint64_t)handle[j].st.st_size);
				} else if (handle[j].st.st_mtime != file->mtime_sec
					|| STAT_NSEC(&handle[j].st) != file->mtime_nsec) {
					log_error("Unexpected time change at file '%s' from %" PRIu64 ".%d to %" PRIu64 ".%d.\n", handle[j].path, file->mtime_sec, file->mtime_nsec, (uint64_t)handle[j].st.st_mtime, STAT_NSEC(&handle[j].st));
				} else {
					log_error("Unexpected inode change from %" PRIu64 " to %" PRIu64 " at file '%s'.\n", file->inode, (uint64_t)handle[j].st.st_ino, handle[j].path);
				}
				log_error("WARNING! You cannot modify files during a sync.\n");
				log_error("Rerun the sync command when finished.\n");
				++error;
				/* if the file is changed, it means that it was modified during sync */
				/* this isn't a serious error, so we skip this block, and continue with others */
				continue;
			}

			read_size = handle_read(&handle[j], file_pos, buffer, state->block_size, log_fatal, 0);
			if (read_size == -1) {
				/* LCOV_EXCL_START */
				if (errno == EIO) {
					log_tag("error:%u:%s:%s: Read EIO error at position %u. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), file_pos, strerror(errno));
					log_fatal("DANGER! Unexpected input/output read error in a data disk, it isn't possible to sync.\n");
					log_fatal("Ensure that disk '%s' is sane and that file '%s' can be read.\n", disk->dir, handle[j].path);
					log_fatal("Stopping at block %u\n", i);
					++io_error;
					goto bail;
				}

				log_tag("error:%u:%s:%s: Read error at position %u. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), file_pos, strerror(errno));
				log_fatal("WARNING! Unexpected read error in a data disk, it isn't possible to sync.\n");
				log_fatal("Ensure that file '%s' can be read.\n", handle[j].path);
				log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, file->sub, esc_buffer));
				++error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}

			/* until now is disk */
			state_usage_disk(state, handle, &j, 1);

			state_usage_file(state, disk, file);

			countsize += read_size;

			/* now compute the hash, with the previous hash function if a rehash is pending */
			if (rehash) {
				memhash(state->prevhash, state->prevhashseed, hash, buffer, read_size);
			} else {
				memhash(state->hash, state->hashseed, hash, buffer, read_size);
			}

			/* until now is hash */
			state_usage_hash(state);

			if (block_state == BLOCK_STATE_REP) {
				/* compare the hash */
				if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
					log_tag("error:%u:%s:%s: Unexpected data change\n", i, disk->name, esc_tag(file->sub, esc_buffer));
					log_error("Data change at file '%s' at position '%u'\n", handle[j].path, file_pos);
					log_error("WARNING! Unexpected data modification of a file without parity!\n");

					if (file_flag_has(file, FILE_IS_COPY)) {
						log_error("This file was detected as a copy of another file with the same name, size,\n");
						log_error("and timestamp, but the file data isn't matching the assumed copy.\n");
						log_error("If this is a false positive, and the files are expected to be different,\n");
						log_error("you can 'sync' anyway using 'snapraid --force-nocopy sync'\n");
					} else {
						log_error("Try removing the file from the array and rerun the 'sync' command!\n");
					}

					/* block sync to allow a recovery before overwriting */
					/* the parity needed to make such recovery */
					*skip_sync = 1; /* avoid to run the next sync */

					++silent_error;
					continue;
				}
			} else {
				/* the only other case is BLOCK_STATE_CHG */
				assert(block_state == BLOCK_STATE_CHG);

				/* copy the hash in the block */
				memcpy(block->hash, hash, BLOCK_HASH_SIZE);

				/* and mark the block as hashed */
				block_state_set(block, BLOCK_STATE_REP);

				/* mark the state as needing write */
				state->need_write = 1;
			}

			/* count the number of processed block */
			++countpos;

			/* progress */
			if (state_progress(state, 0, i, countpos, countmax, countsize)) {
				/* LCOV_EXCL_START */
				*skip_sync = 1; /* avoid to run the next sync */

				/* remember the stop request, because the break leaves only the block loop */
				interrupted = 1;
				break;
				/* LCOV_EXCL_STOP */
			}
		}

		/* close the last file in the disk */
		if (handle[j].file != 0) {
			/* keep a pointer at the file we are going to close for error reporting */
			struct snapraid_file* report = handle[j].file;
			ret = handle_close(&handle[j]);
			if (ret == -1) {
				/* LCOV_EXCL_START */
				/* This one is really an unexpected error, because we are only reading */
				/* and closing a descriptor should never fail */
				if (errno == EIO) {
					log_tag("error:%u:%s:%s: Close EIO error. %s\n", blockmax, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
					log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to sync.\n");
					log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle[j].path);
					log_fatal("Stopping at block %u\n", blockmax);
					++io_error;
					goto bail;
				}

				log_tag("error:%u:%s:%s: Close error. %s\n", blockmax, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
				log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to sync.\n");
				log_fatal("Ensure that file '%s' can be accessed.\n", handle[j].path);
				log_fatal("Stopping at block %u\n", blockmax);
				++error;
				goto bail;
				/* LCOV_EXCL_STOP */
			}
		}

		/* a stop request ends the whole run, don't start hashing the next disk */
		if (interrupted)
			break;
	}

end:
	state_progress_end(state, countpos, countmax, countsize);

	/* note that at this point no io_error is possible */
	/* because at the first one we bail out */
	assert(io_error == 0);

	if (error || io_error || silent_error) {
		msg_status("\n");
		msg_status("%8u file errors\n", error);
		msg_status("%8u io errors\n", io_error);
		msg_status("%8u data errors\n", silent_error);
	} else {
		/* print the result only if processed something */
		if (countpos != 0)
			msg_status("Everything OK\n");
	}

	if (error)
		log_fatal("WARNING! Unexpected file errors!\n");

	log_tag("hash_summary:error_file:%u\n", error);

	/* proceed without bailing out */
	goto finish;

bail:
	/* on bail, don't run the next sync */
	*skip_sync = 1;

	/* close files left open */
	for (j = 0; j < diskmax; ++j) {
		/* read file and disk before closing, to still have them for error reporting */
		struct snapraid_file* file = handle[j].file;
		struct snapraid_disk* disk = handle[j].disk;
		ret = handle_close(&handle[j]);
		if (ret == -1) {
			log_tag("error:%u:%s:%s: Close error. %s\n", i, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
			log_fatal("DANGER! Unexpected close error in a data disk.\n");
			++error;
			/* continue, as we are already exiting */
		}
	}

finish:
	free(handle);
	free(buffer_alloc);

	if (error + io_error + silent_error != 0)
		return -1;
	return 0;
}
|
|
|
|
/****************************************************************************/
|
|
/* sync */
|
|
|
|
/**
 * Sync plan to use.
 *
 * It's the context passed as opaque pointer to block_is_enabled().
 */
struct snapraid_plan {
	unsigned handle_max; /**< Number of entries in ::handle_map. */
	struct snapraid_handle* handle_map; /**< Handles of the disks, one entry for each disk position. */
	int force_full; /**< If different than 0, all the blocks are handled like having invalid parity. */
};
|
|
|
|
/**
 * A block that failed the hash check, or that was deleted.
 */
struct failed_struct {
	unsigned index; /**< Index of the failed block. It's the key used for sorting. */
	unsigned size; /**< Size of the block. */

	struct snapraid_block* block; /**< The failed block, or BLOCK_DELETED for a deleted block */
};

/**
 * Comparison function for sorting by index.
 *
 * Both the arguments are pointers at struct failed_struct.
 * Returns -1, 0 or 1 if the index of the first one is lower, equal
 * or greater than the index of the second one.
 */
int failed_compare_by_index(const void* void_a, const void* void_b)
{
	unsigned lhs = ((const struct failed_struct*)void_a)->index;
	unsigned rhs = ((const struct failed_struct*)void_b)->index;

	/* three-way compare without subtraction, to avoid any wrap-around */
	return (lhs > rhs) - (lhs < rhs);
}
|
|
|
|
/**
|
|
* Buffer for storing the new hashes.
|
|
*/
|
|
struct snapraid_rehash {
|
|
unsigned char hash[HASH_MAX];
|
|
struct snapraid_block* block;
|
|
};
|
|
|
|
/**
|
|
* Check if we have to process the specified block index ::i.
|
|
*/
|
|
static int block_is_enabled(void* void_plan, block_off_t i)
|
|
{
|
|
struct snapraid_plan* plan = void_plan;
|
|
unsigned j;
|
|
int one_invalid;
|
|
int one_valid;
|
|
|
|
/* for each disk */
|
|
one_invalid = 0;
|
|
one_valid = 0;
|
|
for (j = 0; j < plan->handle_max; ++j) {
|
|
struct snapraid_block* block;
|
|
struct snapraid_disk* disk = plan->handle_map[j].disk;
|
|
|
|
/* if no disk, nothing to check */
|
|
if (!disk)
|
|
continue;
|
|
|
|
block = fs_par2block_find(disk, i);
|
|
|
|
if (block_has_file(block))
|
|
one_valid = 1;
|
|
|
|
if (block_has_invalid_parity(block) || plan->force_full)
|
|
one_invalid = 1;
|
|
}
|
|
|
|
/* if none valid or none invalid, we don't need to update */
|
|
if (!one_invalid || !one_valid)
|
|
return 0;
|
|
|
|
return 1;
|
|
}
|
|
|
|
static void sync_data_reader(struct snapraid_worker* worker, struct snapraid_task* task)
|
|
{
|
|
struct snapraid_io* io = worker->io;
|
|
struct snapraid_state* state = io->state;
|
|
struct snapraid_handle* handle = worker->handle;
|
|
struct snapraid_disk* disk = handle->disk;
|
|
block_off_t blockcur = task->position;
|
|
unsigned char* buffer = task->buffer;
|
|
int ret;
|
|
char esc_buffer[ESC_MAX];
|
|
|
|
/* if the disk position is not used */
|
|
if (!disk) {
|
|
/* use an empty block */
|
|
memset(buffer, 0, state->block_size);
|
|
task->state = TASK_STATE_DONE;
|
|
return;
|
|
}
|
|
|
|
/* get the block */
|
|
task->block = fs_par2block_find(disk, blockcur);
|
|
|
|
/* if the block has no file, meaning that it's EMPTY or DELETED, */
|
|
/* it doesn't participate in the new parity computation */
|
|
if (!block_has_file(task->block)) {
|
|
/* use an empty block */
|
|
memset(buffer, 0, state->block_size);
|
|
task->state = TASK_STATE_DONE;
|
|
return;
|
|
}
|
|
|
|
/* get the file of this block */
|
|
task->file = fs_par2file_get(disk, blockcur, &task->file_pos);
|
|
|
|
/* if the file is different than the current one, close it */
|
|
if (handle->file != 0 && handle->file != task->file) {
|
|
/* keep a pointer at the file we are going to close for error reporting */
|
|
struct snapraid_file* report = handle->file;
|
|
ret = handle_close(handle);
|
|
if (ret == -1) {
|
|
/* LCOV_EXCL_START */
|
|
/* This one is really an unexpected error, because we are only reading */
|
|
/* and closing a descriptor should never fail */
|
|
if (errno == EIO) {
|
|
log_tag("error:%u:%s:%s: Close EIO error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
|
|
log_fatal("DANGER! Unexpected input/output close error in a data disk, it isn't possible to sync.\n");
|
|
log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
|
|
log_fatal("Stopping at block %u\n", blockcur);
|
|
task->state = TASK_STATE_IOERROR;
|
|
return;
|
|
}
|
|
|
|
log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(report->sub, esc_buffer), strerror(errno));
|
|
log_fatal("WARNING! Unexpected close error in a data disk, it isn't possible to sync.\n");
|
|
log_fatal("Ensure that file '%s' can be accessed.\n", handle->path);
|
|
log_fatal("Stopping at block %u\n", blockcur);
|
|
task->state = TASK_STATE_ERROR;
|
|
return;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
}
|
|
|
|
ret = handle_open(handle, task->file, state->file_mode, log_error, 0);
|
|
if (ret == -1) {
|
|
if (errno == EIO) {
|
|
/* LCOV_EXCL_START */
|
|
log_tag("error:%u:%s:%s: Open EIO error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
|
|
log_fatal("DANGER! Unexpected input/output open error in a data disk, it isn't possible to sync.\n");
|
|
log_fatal("Ensure that disk '%s' is sane and that file '%s' can be accessed.\n", disk->dir, handle->path);
|
|
log_fatal("Stopping at block %u\n", blockcur);
|
|
task->state = TASK_STATE_IOERROR;
|
|
return;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
|
|
if (errno == ENOENT) {
|
|
log_tag("error:%u:%s:%s: Open ENOENT error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
|
|
log_error("Missing file '%s'.\n", handle->path);
|
|
log_error("WARNING! You cannot modify data disk during a sync.\n");
|
|
log_error("Rerun the sync command when finished.\n");
|
|
/* if the file is missing, it means that it was removed during sync */
|
|
/* this isn't a serious error, so we skip this block, and continue with others */
|
|
task->state = TASK_STATE_ERROR_CONTINUE;
|
|
return;
|
|
}
|
|
|
|
if (errno == EACCES) {
|
|
log_tag("error:%u:%s:%s: Open EACCES error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
|
|
log_error("No access at file '%s'.\n", handle->path);
|
|
log_error("WARNING! Please fix the access permission in the data disk.\n");
|
|
log_error("Rerun the sync command when finished.\n");
|
|
/* this isn't a serious error, so we skip this block, and continue with others */
|
|
task->state = TASK_STATE_ERROR_CONTINUE;
|
|
return;
|
|
}
|
|
|
|
/* LCOV_EXCL_START */
|
|
log_tag("error:%u:%s:%s: Open error. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), strerror(errno));
|
|
log_fatal("WARNING! Unexpected open error in a data disk, it isn't possible to sync.\n");
|
|
log_fatal("Ensure that file '%s' can be accessed.\n", handle->path);
|
|
log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, task->file->sub, esc_buffer));
|
|
task->state = TASK_STATE_ERROR;
|
|
return;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
|
|
/* check if the file is changed */
|
|
if (handle->st.st_size != task->file->size
|
|
|| handle->st.st_mtime != task->file->mtime_sec
|
|
|| STAT_NSEC(&handle->st) != task->file->mtime_nsec
|
|
|| handle->st.st_ino != task->file->inode
|
|
) {
|
|
log_tag("error:%u:%s:%s: Unexpected attribute change\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer));
|
|
if (handle->st.st_size != task->file->size) {
|
|
log_error("Unexpected size change at file '%s' from %" PRIu64 " to %" PRIu64 ".\n", handle->path, task->file->size, (uint64_t)handle->st.st_size);
|
|
} else if (handle->st.st_mtime != task->file->mtime_sec
|
|
|| STAT_NSEC(&handle->st) != task->file->mtime_nsec) {
|
|
log_error("Unexpected time change at file '%s' from %" PRIu64 ".%d to %" PRIu64 ".%d.\n", handle->path, task->file->mtime_sec, task->file->mtime_nsec, (uint64_t)handle->st.st_mtime, STAT_NSEC(&handle->st));
|
|
} else {
|
|
log_error("Unexpected inode change from %" PRIu64 " to %" PRIu64 " at file '%s'.\n", task->file->inode, (uint64_t)handle->st.st_ino, handle->path);
|
|
}
|
|
log_error("WARNING! You cannot modify files during a sync.\n");
|
|
log_error("Rerun the sync command when finished.\n");
|
|
/* if the file is changed, it means that it was modified during sync */
|
|
/* this isn't a serious error, so we skip this block, and continue with others */
|
|
task->state = TASK_STATE_ERROR_CONTINUE;
|
|
return;
|
|
}
|
|
|
|
task->read_size = handle_read(handle, task->file_pos, buffer, state->block_size, log_error, 0);
|
|
if (task->read_size == -1) {
|
|
/* LCOV_EXCL_START */
|
|
if (errno == EIO) {
|
|
log_tag("error:%u:%s:%s: Read EIO error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
|
|
log_error("Input/Output error in file '%s' at position '%u'\n", handle->path, task->file_pos);
|
|
task->state = TASK_STATE_IOERROR_CONTINUE;
|
|
return;
|
|
}
|
|
|
|
log_tag("error:%u:%s:%s: Read error at position %u. %s\n", blockcur, disk->name, esc_tag(task->file->sub, esc_buffer), task->file_pos, strerror(errno));
|
|
log_fatal("WARNING! Unexpected read error in a data disk, it isn't possible to sync.\n");
|
|
log_fatal("Ensure that file '%s' can be read.\n", handle->path);
|
|
log_fatal("Stopping to allow recovery. Try with 'snapraid check -f /%s'\n", fmt_poll(disk, task->file->sub, esc_buffer));
|
|
task->state = TASK_STATE_ERROR;
|
|
return;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
|
|
/* store the path of the opened file */
|
|
pathcpy(task->path, sizeof(task->path), handle->path);
|
|
|
|
task->state = TASK_STATE_DONE;
|
|
}
|
|
|
|
static void sync_parity_writer(struct snapraid_worker* worker, struct snapraid_task* task)
|
|
{
|
|
struct snapraid_io* io = worker->io;
|
|
struct snapraid_state* state = io->state;
|
|
struct snapraid_parity_handle* parity_handle = worker->parity_handle;
|
|
unsigned level = parity_handle->level;
|
|
block_off_t blockcur = task->position;
|
|
unsigned char* buffer = task->buffer;
|
|
int ret;
|
|
|
|
/* write parity */
|
|
ret = parity_write(parity_handle, blockcur, buffer, state->block_size);
|
|
if (ret == -1) {
|
|
/* LCOV_EXCL_START */
|
|
if (errno == EIO) {
|
|
log_tag("parity_error:%u:%s: Write EIO error. %s\n", blockcur, lev_config_name(level), strerror(errno));
|
|
log_error("Input/Output error in parity '%s' at position '%u'\n", lev_config_name(level), blockcur);
|
|
task->state = TASK_STATE_IOERROR_CONTINUE;
|
|
return;
|
|
}
|
|
|
|
log_tag("parity_error:%u:%s: Write error. %s\n", blockcur, lev_config_name(level), strerror(errno));
|
|
log_fatal("WARNING! Unexpected write error in the %s disk, it isn't possible to sync.\n", lev_name(level));
|
|
log_fatal("Ensure that disk '%s' has some free space available.\n", lev_config_name(level));
|
|
log_fatal("Stopping at block %u\n", blockcur);
|
|
task->state = TASK_STATE_ERROR;
|
|
return;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
|
|
task->state = TASK_STATE_DONE;
|
|
}
|
|
|
|
static int state_sync_process(struct snapraid_state* state, struct snapraid_parity_handle* parity_handle, block_off_t blockstart, block_off_t blockmax)
|
|
{
|
|
struct snapraid_io io;
|
|
struct snapraid_plan plan;
|
|
struct snapraid_handle* handle;
|
|
void* rehandle_alloc;
|
|
struct snapraid_rehash* rehandle;
|
|
unsigned diskmax;
|
|
block_off_t blockcur;
|
|
unsigned j;
|
|
void* zero_alloc;
|
|
void** zero;
|
|
void* copy_alloc;
|
|
void** copy;
|
|
unsigned buffermax;
|
|
data_off_t countsize;
|
|
block_off_t countpos;
|
|
block_off_t countmax;
|
|
block_off_t autosavedone;
|
|
block_off_t autosavelimit;
|
|
block_off_t autosavemissing;
|
|
int ret;
|
|
unsigned error;
|
|
unsigned silent_error;
|
|
unsigned io_error;
|
|
time_t now;
|
|
struct failed_struct* failed;
|
|
int* failed_map;
|
|
unsigned l;
|
|
unsigned* waiting_map;
|
|
unsigned waiting_mac;
|
|
char esc_buffer[ESC_MAX];
|
|
|
|
/* the sync process assumes that all the hashes are correct */
|
|
/* including the ones from CHG and DELETED blocks */
|
|
assert(state->clear_past_hash != 0);
|
|
|
|
/* get the present time */
|
|
now = time(0);
|
|
|
|
/* maps the disks to handles */
|
|
handle = handle_mapping(state, &diskmax);
|
|
|
|
/* rehash buffers */
|
|
rehandle = malloc_nofail_align(diskmax * sizeof(struct snapraid_rehash), &rehandle_alloc);
|
|
|
|
/* we need 1 * data + 1 * parity */
|
|
buffermax = diskmax + state->level;
|
|
|
|
/* initialize the io threads */
|
|
io_init(&io, state, state->opt.io_cache, buffermax, sync_data_reader, handle, diskmax, 0, sync_parity_writer, parity_handle, state->level);
|
|
|
|
/* allocate the copy buffer */
|
|
copy = malloc_nofail_vector_align(diskmax, diskmax, state->block_size, ©_alloc);
|
|
|
|
/* allocate and fill the zero buffer */
|
|
zero = malloc_nofail_align(state->block_size, &zero_alloc);
|
|
memset(zero, 0, state->block_size);
|
|
raid_zero(zero);
|
|
|
|
failed = malloc_nofail(diskmax * sizeof(struct failed_struct));
|
|
failed_map = malloc_nofail(diskmax * sizeof(unsigned));
|
|
|
|
/* possibly waiting disks */
|
|
waiting_mac = diskmax > RAID_PARITY_MAX ? diskmax : RAID_PARITY_MAX;
|
|
waiting_map = malloc_nofail(waiting_mac * sizeof(unsigned));
|
|
|
|
error = 0;
|
|
silent_error = 0;
|
|
io_error = 0;
|
|
|
|
/* first count the number of blocks to process */
|
|
countmax = 0;
|
|
plan.handle_max = diskmax;
|
|
plan.handle_map = handle;
|
|
plan.force_full = state->opt.force_full;
|
|
for (blockcur = blockstart; blockcur < blockmax; ++blockcur) {
|
|
if (!block_is_enabled(&plan, blockcur))
|
|
continue;
|
|
++countmax;
|
|
}
|
|
|
|
/* compute the autosave size for all disk, even if not read */
|
|
/* this makes sense because the speed should be almost the same */
|
|
/* if the disks are read in parallel */
|
|
autosavelimit = state->autosave / (diskmax * state->block_size);
|
|
autosavemissing = countmax; /* blocks to do */
|
|
autosavedone = 0; /* blocks done */
|
|
|
|
/* drop until now */
|
|
state_usage_waste(state);
|
|
|
|
countsize = 0;
|
|
countpos = 0;
|
|
|
|
/* start all the worker threads */
|
|
io_start(&io, blockstart, blockmax, &block_is_enabled, &plan);
|
|
|
|
if (!state_progress_begin(state, blockstart, blockmax, countmax))
|
|
goto end;
|
|
|
|
while (1) {
|
|
unsigned failed_count;
|
|
int error_on_this_block;
|
|
int silent_error_on_this_block;
|
|
int io_error_on_this_block;
|
|
int fixed_error_on_this_block;
|
|
int parity_needs_to_be_updated;
|
|
int parity_going_to_be_updated;
|
|
snapraid_info info;
|
|
int rehash;
|
|
void** buffer;
|
|
int writer_error[IO_WRITER_ERROR_MAX];
|
|
|
|
/* go to the next block */
|
|
blockcur = io_read_next(&io, &buffer);
|
|
if (blockcur >= blockmax)
|
|
break;
|
|
|
|
/* until now is scheduling */
|
|
state_usage_sched(state);
|
|
|
|
/* one more block processed for autosave */
|
|
++autosavedone;
|
|
--autosavemissing;
|
|
|
|
/* by default process the block, and skip it if something goes wrong */
|
|
error_on_this_block = 0;
|
|
silent_error_on_this_block = 0;
|
|
io_error_on_this_block = 0;
|
|
fixed_error_on_this_block = 0;
|
|
|
|
/* keep track of the number of failed blocks */
|
|
failed_count = 0;
|
|
|
|
/* get block specific info */
|
|
info = info_get(&state->infoarr, blockcur);
|
|
|
|
/* if we have to use the old hash */
|
|
rehash = info_get_rehash(info);
|
|
|
|
/* if the parity requires to be updated */
|
|
/* It could happens that all the blocks are EMPTY/BLK and CHG but with the hash */
|
|
/* still matching because the specific CHG block was not modified. */
|
|
/* In such case, we can avoid to update parity, because it would be the same as before */
|
|
/* Note that CHG/DELETED blocks already present in the content file loaded */
|
|
/* have the hash cleared (::clear_past_hash flag), and then they won't never match the hash. */
|
|
/* We are treating only CHG blocks created at runtime. */
|
|
parity_needs_to_be_updated = state->opt.force_full || state->opt.force_parity_update;
|
|
|
|
/* if the parity is going to be updated */
|
|
parity_going_to_be_updated = 0;
|
|
|
|
/* if the block is marked as bad, we force the parity update */
|
|
/* because the bad block may be the result of a wrong parity */
|
|
if (info_get_bad(info))
|
|
parity_needs_to_be_updated = 1;
|
|
|
|
/* for each disk, process the block */
|
|
for (j = 0; j < diskmax; ++j) {
|
|
struct snapraid_task* task;
|
|
int read_size;
|
|
unsigned char hash[HASH_MAX];
|
|
struct snapraid_block* block;
|
|
unsigned block_state;
|
|
struct snapraid_disk* disk;
|
|
struct snapraid_file* file;
|
|
block_off_t file_pos;
|
|
unsigned diskcur;
|
|
|
|
/* until now is misc */
|
|
state_usage_misc(state);
|
|
|
|
task = io_data_read(&io, &diskcur, waiting_map, &waiting_mac);
|
|
|
|
/* until now is disk */
|
|
state_usage_disk(state, handle, waiting_map, waiting_mac);
|
|
|
|
/* get the results */
|
|
disk = task->disk;
|
|
block = task->block;
|
|
file = task->file;
|
|
file_pos = task->file_pos;
|
|
read_size = task->read_size;
|
|
|
|
/* by default no rehash in case of "continue" */
|
|
rehandle[diskcur].block = 0;
|
|
|
|
/* if the disk position is not used */
|
|
if (!disk)
|
|
continue;
|
|
|
|
state_usage_file(state, disk, file);
|
|
|
|
/* get the state of the block */
|
|
block_state = block_state_get(block);
|
|
|
|
/* if the block has invalid parity, */
|
|
/* we have to take care of it in case of recover */
|
|
if (block_has_invalid_parity(block)) {
|
|
/* store it in the failed set, because */
|
|
/* the parity may be still computed with the previous content */
|
|
failed[failed_count].index = diskcur;
|
|
failed[failed_count].size = state->block_size;
|
|
failed[failed_count].block = block;
|
|
++failed_count;
|
|
|
|
/* if the block has invalid parity, we have to update the parity */
|
|
/* to include this block change */
|
|
/* This also apply to CHG blocks, but we are going to handle */
|
|
/* later this case to do the updates only if really needed */
|
|
if (block_state != BLOCK_STATE_CHG)
|
|
parity_needs_to_be_updated = 1;
|
|
|
|
/* note that DELETE blocks are skipped in the next check */
|
|
/* and we have to store them in the failed blocks */
|
|
/* before skipping */
|
|
|
|
/* follow */
|
|
}
|
|
|
|
/* if the block is not used */
|
|
if (!block_has_file(block))
|
|
continue;
|
|
|
|
/* handle error conditions */
|
|
if (task->state == TASK_STATE_IOERROR) {
|
|
/* LCOV_EXCL_START */
|
|
++io_error;
|
|
goto bail;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
if (task->state == TASK_STATE_ERROR) {
|
|
/* LCOV_EXCL_START */
|
|
++error;
|
|
goto bail;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
if (task->state == TASK_STATE_ERROR_CONTINUE) {
|
|
++error;
|
|
error_on_this_block = 1;
|
|
continue;
|
|
}
|
|
if (task->state == TASK_STATE_IOERROR_CONTINUE) {
|
|
++io_error;
|
|
if (io_error >= state->opt.io_error_limit) {
|
|
/* LCOV_EXCL_START */
|
|
log_fatal("DANGER! Unexpected input/output read error in a data disk, it isn't possible to sync.\n");
|
|
log_fatal("Ensure that disk '%s' is sane and that file '%s' can be read.\n", disk->dir, task->path);
|
|
log_fatal("Stopping at block %u\n", blockcur);
|
|
goto bail;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
|
|
/* otherwise continue */
|
|
io_error_on_this_block = 1;
|
|
continue;
|
|
}
|
|
if (task->state != TASK_STATE_DONE) {
|
|
/* LCOV_EXCL_START */
|
|
log_fatal("Internal inconsistency in task state\n");
|
|
os_abort();
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
|
|
countsize += read_size;
|
|
|
|
/* now compute the hash */
|
|
if (rehash) {
|
|
memhash(state->prevhash, state->prevhashseed, hash, buffer[diskcur], read_size);
|
|
|
|
/* compute the new hash, and store it */
|
|
rehandle[diskcur].block = block;
|
|
memhash(state->hash, state->hashseed, rehandle[diskcur].hash, buffer[diskcur], read_size);
|
|
} else {
|
|
memhash(state->hash, state->hashseed, hash, buffer[diskcur], read_size);
|
|
}
|
|
|
|
/* until now is hash */
|
|
state_usage_hash(state);
|
|
|
|
if (block_has_updated_hash(block)) {
|
|
/* compare the hash */
|
|
if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
|
|
/* if the file has invalid parity, it's a REP changed during the sync */
|
|
if (block_has_invalid_parity(block)) {
|
|
log_tag("error:%u:%s:%s: Unexpected data change\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer));
|
|
log_error("Data change at file '%s' at position '%u'\n", task->path, file_pos);
|
|
log_error("WARNING! Unexpected data modification of a file without parity!\n");
|
|
|
|
if (file_flag_has(file, FILE_IS_COPY)) {
|
|
log_error("This file was detected as a copy of another file with the same name, size,\n");
|
|
log_error("and timestamp, but the file data isn't matching the assumed copy.\n");
|
|
log_error("If this is a false positive, and the files are expected to be different,\n");
|
|
log_error("you can 'sync' anyway using 'snapraid --force-nocopy sync'\n");
|
|
} else {
|
|
log_error("Try removing the file from the array and rerun the 'sync' command!\n");
|
|
}
|
|
|
|
++error;
|
|
|
|
/* if the file is changed, it means that it was modified during sync */
|
|
/* this isn't a serious error, so we skip this block, and continue with others */
|
|
error_on_this_block = 1;
|
|
continue;
|
|
} else { /* otherwise it's a BLK with silent error */
|
|
unsigned diff = memdiff(hash, block->hash, BLOCK_HASH_SIZE);
|
|
log_tag("error:%u:%s:%s: Data error at position %u, diff bits %u/%u\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), file_pos, diff, BLOCK_HASH_SIZE * 8);
|
|
log_error("Data error in file '%s' at position '%u', diff bits %u/%u\n", task->path, file_pos, diff, BLOCK_HASH_SIZE * 8);
|
|
|
|
/* save the failed block for the fix */
|
|
failed[failed_count].index = diskcur;
|
|
failed[failed_count].size = read_size;
|
|
failed[failed_count].block = block;
|
|
++failed_count;
|
|
|
|
/* silent errors are very rare, and are not a signal that a disk */
|
|
/* is going to fail. So, we just continue marking the block as bad */
|
|
/* just like in scrub */
|
|
++silent_error;
|
|
silent_error_on_this_block = 1;
|
|
continue;
|
|
}
|
|
}
|
|
} else {
|
|
/* if until now the parity doesn't need to be updated */
|
|
if (!parity_needs_to_be_updated) {
|
|
/* for sure it's a CHG block, because EMPTY are processed before with "continue" */
|
|
/* and BLK and REP have "block_has_updated_hash()" as 1, and all the others */
|
|
/* have "parity_needs_to_be_updated" already at 1 */
|
|
assert(block_state_get(block) == BLOCK_STATE_CHG);
|
|
|
|
/* if the hash represents the data unequivocally */
|
|
if (hash_is_unique(block->hash)) {
|
|
/* check if the hash is changed */
|
|
if (memcmp(hash, block->hash, BLOCK_HASH_SIZE) != 0) {
|
|
/* the block is different, and we must update parity */
|
|
parity_needs_to_be_updated = 1;
|
|
}
|
|
} else {
|
|
/* if the hash is already invalid, we update parity */
|
|
parity_needs_to_be_updated = 1;
|
|
}
|
|
}
|
|
|
|
/* copy the hash in the block, but doesn't mark the block as hashed */
|
|
/* this allow in case of skipped block to do not save the failed computation */
|
|
memcpy(block->hash, hash, BLOCK_HASH_SIZE);
|
|
|
|
/* note that in case of rehash, this is the wrong hash, */
|
|
/* but it will be overwritten later */
|
|
}
|
|
}
|
|
|
|
/* if we have only silent errors we can try to fix them on-the-fly */
|
|
/* note that the fix is not written to disk, but used only to */
|
|
/* compute the new parity */
|
|
if (!error_on_this_block && !io_error_on_this_block && silent_error_on_this_block) {
|
|
unsigned failed_mac;
|
|
int something_to_recover = 0;
|
|
|
|
/* sort the failed vector */
|
|
/* because with threads it may be in any order */
|
|
/* but RAID requires the indexes to be sorted */
|
|
qsort(failed, failed_count, sizeof(failed[0]), failed_compare_by_index);
|
|
|
|
/* setup the blocks to recover */
|
|
failed_mac = 0;
|
|
for (j = 0; j < failed_count; ++j) {
|
|
unsigned char* block_buffer = buffer[failed[j].index];
|
|
unsigned char* block_copy = copy[failed[j].index];
|
|
unsigned block_state = block_state_get(failed[j].block);
|
|
|
|
/* we try to recover only if at least one BLK is present */
|
|
if (block_state == BLOCK_STATE_BLK)
|
|
something_to_recover = 1;
|
|
|
|
/* save a copy of the content just read */
|
|
/* that it's going to be overwritten by the recovering function */
|
|
memcpy(block_copy, block_buffer, state->block_size);
|
|
|
|
if (block_state == BLOCK_STATE_CHG
|
|
&& hash_is_zero(failed[j].block->hash)
|
|
) {
|
|
/* if the block was filled with 0, restore this state */
|
|
/* and avoid to recover it */
|
|
memset(block_buffer, 0, state->block_size);
|
|
} else {
|
|
/* if we have too many failures, we cannot recover */
|
|
if (failed_mac >= state->level)
|
|
break;
|
|
|
|
/* otherwise it has to be recovered */
|
|
failed_map[failed_mac++] = failed[j].index;
|
|
}
|
|
}
|
|
|
|
/* if we have something to recover and enough parity */
|
|
if (something_to_recover && j == failed_count) {
|
|
/* until now is misc */
|
|
state_usage_misc(state);
|
|
|
|
/* read the parity */
|
|
/* we are sure that parity exists because */
|
|
/* we have at least one BLK block */
|
|
for (l = 0; l < state->level; ++l) {
|
|
ret = parity_read(&parity_handle[l], blockcur, buffer[diskmax + l], state->block_size, log_error);
|
|
if (ret == -1) {
|
|
/* LCOV_EXCL_START */
|
|
if (errno == EIO) {
|
|
log_tag("parity_error:%u:%s: Read EIO error. %s\n", blockcur, lev_config_name(l), strerror(errno));
|
|
if (io_error >= state->opt.io_error_limit) {
|
|
log_fatal("DANGER! Unexpected input/output read error in the %s disk, it isn't possible to sync.\n", lev_name(l));
|
|
log_fatal("Ensure that disk '%s' is sane and can be read.\n", lev_config_name(l));
|
|
log_fatal("Stopping at block %u\n", blockcur);
|
|
++io_error;
|
|
goto bail;
|
|
}
|
|
|
|
log_error("Input/Output error in parity '%s' at position '%u'\n", lev_config_name(l), blockcur);
|
|
++io_error;
|
|
io_error_on_this_block = 1;
|
|
continue;
|
|
}
|
|
|
|
log_tag("parity_error:%u:%s: Read error. %s\n", blockcur, lev_config_name(l), strerror(errno));
|
|
log_fatal("WARNING! Unexpected read error in the %s disk, it isn't possible to sync.\n", lev_name(l));
|
|
log_fatal("Ensure that disk '%s' can be read.\n", lev_config_name(l));
|
|
log_fatal("Stopping at block %u\n", blockcur);
|
|
++error;
|
|
goto bail;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
|
|
/* until now is parity */
|
|
state_usage_parity(state, &l, 1);
|
|
}
|
|
|
|
/* if no error in parity read */
|
|
if (!io_error_on_this_block) {
|
|
/* try to fix the data */
|
|
/* note that this is a simple fix algorithm, that doesn't take into */
|
|
/* account the case of a wrong parity */
|
|
/* only 'fix' supports the most advanced fixing */
|
|
raid_rec(failed_mac, failed_map, diskmax, state->level, state->block_size, buffer);
|
|
|
|
/* until now is raid */
|
|
state_usage_raid(state);
|
|
|
|
/* check the result and prepare the data */
|
|
for (j = 0; j < failed_count; ++j) {
|
|
unsigned char hash[HASH_MAX];
|
|
unsigned char* block_buffer = buffer[failed[j].index];
|
|
unsigned char* block_copy = copy[failed[j].index];
|
|
unsigned block_state = block_state_get(failed[j].block);
|
|
|
|
if (block_state == BLOCK_STATE_BLK) {
|
|
unsigned size = failed[j].size;
|
|
|
|
/* compute the hash of the recovered block */
|
|
if (rehash) {
|
|
memhash(state->prevhash, state->prevhashseed, hash, block_buffer, size);
|
|
} else {
|
|
memhash(state->hash, state->hashseed, hash, block_buffer, size);
|
|
}
|
|
|
|
/* until now is hash */
|
|
state_usage_hash(state);
|
|
|
|
/* if the hash doesn't match */
|
|
if (memcmp(hash, failed[j].block->hash, BLOCK_HASH_SIZE) != 0) {
|
|
/* we have not recovered */
|
|
break;
|
|
}
|
|
|
|
/* pad with 0 if needed */
|
|
if (size < state->block_size)
|
|
memset(block_buffer + size, 0, state->block_size - size);
|
|
} else {
|
|
/* otherwise restore the content */
|
|
/* because we are not interested in the old state */
|
|
/* that it's recovered for CHG, REP and DELETED blocks */
|
|
memcpy(block_buffer, block_copy, state->block_size);
|
|
}
|
|
}
|
|
|
|
/* if all is processed, we have fixed it */
|
|
if (j == failed_count)
|
|
fixed_error_on_this_block = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* if we have read all the data required and it's correct, proceed with the parity */
|
|
if (!error_on_this_block && !io_error_on_this_block
|
|
&& (!silent_error_on_this_block || fixed_error_on_this_block)
|
|
) {
|
|
/* update the parity only if really needed */
|
|
if (parity_needs_to_be_updated) {
|
|
/* compute the parity */
|
|
raid_gen(diskmax, state->level, state->block_size, buffer);
|
|
|
|
/* until now is raid */
|
|
state_usage_raid(state);
|
|
|
|
/* mark that the parity is going to be written */
|
|
parity_going_to_be_updated = 1;
|
|
}
|
|
|
|
/* for each disk, mark the blocks as processed */
|
|
for (j = 0; j < diskmax; ++j) {
|
|
struct snapraid_block* block;
|
|
|
|
if (!handle[j].disk)
|
|
continue;
|
|
|
|
block = fs_par2block_find(handle[j].disk, blockcur);
|
|
|
|
if (block == BLOCK_NULL) {
|
|
/* nothing to do */
|
|
continue;
|
|
}
|
|
|
|
/* if it's a deleted block */
|
|
if (block_state_get(block) == BLOCK_STATE_DELETED) {
|
|
/* the parity is now updated without this block, so it's now empty */
|
|
fs_deallocate(handle[j].disk, blockcur);
|
|
continue;
|
|
}
|
|
|
|
/* now all the blocks have the hash and the parity computed */
|
|
block_state_set(block, BLOCK_STATE_BLK);
|
|
}
|
|
|
|
/* we update the info block only if we really have updated the parity */
|
|
/* because otherwise the time/justsynced info would be misleading as we didn't */
|
|
/* wrote the parity at this time */
|
|
/* we also update the info block only if no silent error was found */
|
|
/* because has no sense to refresh the time for data that we know bad */
|
|
if (parity_needs_to_be_updated
|
|
&& !silent_error_on_this_block
|
|
) {
|
|
/* if rehash is needed */
|
|
if (rehash) {
|
|
/* store all the new hash already computed */
|
|
for (j = 0; j < diskmax; ++j) {
|
|
if (rehandle[j].block)
|
|
memcpy(rehandle[j].block->hash, rehandle[j].hash, BLOCK_HASH_SIZE);
|
|
}
|
|
}
|
|
|
|
/* update the time info of the block */
|
|
/* we are also clearing any previous bad and rehash flag */
|
|
info_set(&state->infoarr, blockcur, info_make(now, 0, 0, 1));
|
|
}
|
|
}
|
|
|
|
/* if a silent (even if corrected) or input/output error was found */
|
|
/* mark the block as bad to have check/fix to handle it */
|
|
/* because our correction is in memory only and not yet written */
|
|
if (silent_error_on_this_block || io_error_on_this_block) {
|
|
/* set the error status keeping the other info */
|
|
info_set(&state->infoarr, blockcur, info_set_bad(info));
|
|
}
|
|
|
|
/* finally schedule parity write */
|
|
/* Note that the calls to io_parity_write() are mandatory */
|
|
/* even if the parity doesn't need to be updated */
|
|
/* This because we want to keep track of the time usage */
|
|
state_usage_misc(state);
|
|
|
|
/* write start */
|
|
io_write_preset(&io, blockcur, !parity_going_to_be_updated);
|
|
|
|
/* write the parity */
|
|
for (l = 0; l < state->level; ++l) {
|
|
unsigned levcur;
|
|
|
|
io_parity_write(&io, &levcur, waiting_map, &waiting_mac);
|
|
|
|
/* until now is parity */
|
|
state_usage_parity(state, waiting_map, waiting_mac);
|
|
}
|
|
|
|
/* write finished */
|
|
io_write_next(&io, blockcur, !parity_going_to_be_updated, writer_error);
|
|
|
|
/* handle errors reported */
|
|
for (j = 0; j < IO_WRITER_ERROR_MAX; ++j) {
|
|
if (writer_error[j]) {
|
|
switch (j + IO_WRITER_ERROR_BASE) {
|
|
case TASK_STATE_IOERROR_CONTINUE :
|
|
++io_error;
|
|
if (io_error >= state->opt.io_error_limit) {
|
|
/* LCOV_EXCL_START */
|
|
log_fatal("DANGER! Unexpected input/output write error in a parity disk, it isn't possible to sync.\n");
|
|
log_fatal("Stopping at block %u\n", blockcur);
|
|
goto bail;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
break;
|
|
case TASK_STATE_ERROR_CONTINUE :
|
|
++error;
|
|
break;
|
|
case TASK_STATE_IOERROR :
|
|
/* LCOV_EXCL_START */
|
|
++io_error;
|
|
goto bail;
|
|
/* LCOV_EXCL_STOP */
|
|
case TASK_STATE_ERROR :
|
|
/* LCOV_EXCL_START */
|
|
++error;
|
|
goto bail;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
}
|
|
}
|
|
|
|
/* mark the state as needing write */
|
|
state->need_write = 1;
|
|
|
|
/* count the number of processed block */
|
|
++countpos;
|
|
|
|
/* progress */
|
|
if (state_progress(state, &io, blockcur, countpos, countmax, countsize)) {
|
|
/* LCOV_EXCL_START */
|
|
break;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
|
|
/* autosave */
|
|
if ((state->autosave != 0
|
|
&& autosavedone >= autosavelimit /* if we have reached the limit */
|
|
&& autosavemissing >= autosavelimit) /* if we have at least a full step to do */
|
|
/* or if we have a forced autosave at the specified block */
|
|
|| (state->opt.force_autosave_at != 0 && state->opt.force_autosave_at == blockcur)
|
|
) {
|
|
autosavedone = 0; /* restart the counter */
|
|
|
|
/* until now is misc */
|
|
state_usage_misc(state);
|
|
|
|
state_progress_stop(state);
|
|
|
|
msg_progress("Autosaving...\n");
|
|
|
|
/* before writing the new content file we ensure that */
|
|
/* the parity is really written flushing the disk cache */
|
|
for (l = 0; l < state->level; ++l) {
|
|
ret = parity_sync(&parity_handle[l]);
|
|
if (ret == -1) {
|
|
/* LCOV_EXCL_START */
|
|
log_tag("parity_error:%u:%s: Sync error\n", blockcur, lev_config_name(l));
|
|
log_fatal("DANGER! Unexpected sync error in %s disk.\n", lev_name(l));
|
|
log_fatal("Ensure that disk '%s' is sane.\n", lev_config_name(l));
|
|
log_fatal("Stopping at block %u\n", blockcur);
|
|
++error;
|
|
goto bail;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
}
|
|
|
|
/* now we can safely write the content file */
|
|
state_write(state);
|
|
|
|
state_progress_restart(state);
|
|
|
|
/* drop until now */
|
|
state_usage_waste(state);
|
|
}
|
|
}
|
|
|
|
end:
|
|
state_progress_end(state, countpos, countmax, countsize);
|
|
|
|
state_usage_print(state);
|
|
|
|
/* before returning we ensure that */
|
|
/* the parity is really written flushing the disk cache */
|
|
for (l = 0; l < state->level; ++l) {
|
|
ret = parity_sync(&parity_handle[l]);
|
|
if (ret == -1) {
|
|
/* LCOV_EXCL_START */
|
|
log_tag("parity_error:%u:%s: Sync error\n", blockcur, lev_config_name(l));
|
|
log_fatal("DANGER! Unexpected sync error in %s disk.\n", lev_name(l));
|
|
log_fatal("Ensure that disk '%s' is sane.\n", lev_config_name(l));
|
|
log_fatal("Stopping at block %u\n", blockcur);
|
|
++error;
|
|
goto bail;
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
}
|
|
|
|
if (error || silent_error || io_error) {
|
|
msg_status("\n");
|
|
msg_status("%8u file errors\n", error);
|
|
msg_status("%8u io errors\n", io_error);
|
|
msg_status("%8u data errors\n", silent_error);
|
|
} else {
|
|
/* print the result only if processed something */
|
|
if (countpos != 0)
|
|
msg_status("Everything OK\n");
|
|
}
|
|
|
|
if (error)
|
|
log_fatal("WARNING! Unexpected file errors!\n");
|
|
if (io_error)
|
|
log_fatal("DANGER! Unexpected input/output errors! The failing blocks are now marked as bad!\n");
|
|
if (silent_error)
|
|
log_fatal("DANGER! Unexpected data errors! The failing blocks are now marked as bad!\n");
|
|
if (io_error || silent_error) {
|
|
log_fatal("Use 'snapraid status' to list the bad blocks.\n");
|
|
log_fatal("Use 'snapraid -e fix' to recover.\n");
|
|
}
|
|
|
|
log_tag("summary:error_file:%u\n", error);
|
|
log_tag("summary:error_io:%u\n", io_error);
|
|
log_tag("summary:error_data:%u\n", silent_error);
|
|
if (error + silent_error + io_error == 0)
|
|
log_tag("summary:exit:ok\n");
|
|
else
|
|
log_tag("summary:exit:error\n");
|
|
log_flush();
|
|
|
|
bail:
|
|
/* stop all the worker threads */
|
|
io_stop(&io);
|
|
|
|
for (j = 0; j < diskmax; ++j) {
|
|
struct snapraid_file* file = handle[j].file;
|
|
struct snapraid_disk* disk = handle[j].disk;
|
|
ret = handle_close(&handle[j]);
|
|
if (ret == -1) {
|
|
/* LCOV_EXCL_START */
|
|
log_tag("error:%u:%s:%s: Close error. %s\n", blockcur, disk->name, esc_tag(file->sub, esc_buffer), strerror(errno));
|
|
log_fatal("DANGER! Unexpected close error in a data disk.\n");
|
|
++error;
|
|
/* continue, as we are already exiting */
|
|
/* LCOV_EXCL_STOP */
|
|
}
|
|
}
|
|
|
|
free(handle);
|
|
free(zero_alloc);
|
|
free(copy_alloc);
|
|
free(copy);
|
|
free(rehandle_alloc);
|
|
free(failed);
|
|
free(failed_map);
|
|
free(waiting_map);
|
|
io_done(&io);
|
|
|
|
if (state->opt.expect_recoverable) {
|
|
if (error + silent_error + io_error == 0)
|
|
return -1;
|
|
} else {
|
|
if (error + silent_error + io_error != 0)
|
|
return -1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/**
 * Run the 'sync' command on a range of parity blocks.
 *
 * The function goes through these phases:
 * - create/open every parity file with parity_create(), warning if a parity
 *   has fewer blocks than the ones in use, and aborting the program if the
 *   smallest one is too small (unless force_realloc or force_full is set);
 * - if the prehash option is set, hash the data with state_hash_process();
 * - unless the hashing phase asked to skip the sync, resize all the parity
 *   files with parity_chsize(), refresh the state, save the content file
 *   if needed, and process the blocks with state_sync_process();
 * - close all the parity files.
 *
 * @param state The state to sync.
 * @param blockstart First block to process. The program exits with failure
 *   if it is bigger than the allocated parity size.
 * @param blockcount Number of blocks to process starting at blockstart.
 *   If 0, or if the range goes past the allocated parity size,
 *   all the blocks up to the allocated parity size are processed.
 * @return 0 on success, -1 if the hashing phase, the sync phase,
 *   or the closing of a parity file reported an error.
 */
int state_sync(struct snapraid_state* state, block_off_t blockstart, block_off_t blockcount)
{
	block_off_t blockmax;
	block_off_t used_paritymax;
	block_off_t file_paritymax;
	data_off_t size;
	int ret;
	struct snapraid_parity_handle parity_handle[LEV_MAX];
	unsigned unrecoverable_error;
	unsigned l;
	int skip_sync = 0;

	msg_progress("Initializing...\n");

	blockmax = parity_allocated_size(state);

	/* the parity size is always computed on the full allocated size, */
	/* before blockmax is eventually limited by blockcount */
	size = blockmax * (data_off_t)state->block_size;

	/* minimum size of the parity files we expect */
	used_paritymax = parity_used_size(state);

	/* effective size of the parity files */
	file_paritymax = 0;

	if (blockstart > blockmax) {
		/* LCOV_EXCL_START */
		log_fatal("Error in the starting block %u. It's bigger than the parity size %u.\n", blockstart, blockmax);
		exit(EXIT_FAILURE);
		/* LCOV_EXCL_STOP */
	}

	/* adjust the number of block to process */
	/* Note that we compare against the remaining blocks instead of computing */
	/* blockstart + blockcount, because the sum may overflow with a big blockcount. */
	/* The subtraction is safe because blockstart <= blockmax is ensured above. */
	if (blockcount != 0 && blockcount < blockmax - blockstart) {
		blockmax = blockstart + blockcount;
	}

	for (l = 0; l < state->level; ++l) {
		data_off_t out_size;
		block_off_t parityblocks;

		/* create the file and open for writing */
		ret = parity_create(&parity_handle[l], &state->parity[l], l, state->file_mode, state->block_size, state->opt.parity_limit_size);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			log_fatal("WARNING! Without an accessible %s file, it isn't possible to sync.\n", lev_name(l));
			exit(EXIT_FAILURE);
			/* LCOV_EXCL_STOP */
		}

		/* number of block in the parity file */
		parity_size(&parity_handle[l], &out_size);
		parityblocks = out_size / state->block_size;

		/* if the file is too small */
		if (parityblocks < used_paritymax) {
			log_fatal("WARNING! The %s parity has data only %u blocks instead of %u.\n", lev_name(l), parityblocks, used_paritymax);
		}

		/* keep the smallest parity number of blocks */
		if (l == 0 || file_paritymax > parityblocks)
			file_paritymax = parityblocks;
	}

	/* if we do a full parity realloc or computation, having a wrong parity size is expected */
	if (!state->opt.force_realloc && !state->opt.force_full) {
		/* if the parities are too small */
		if (file_paritymax < used_paritymax) {
			/* LCOV_EXCL_START */
			log_fatal("DANGER! One or more the parity files are smaller than expected!\n");
			if (file_paritymax != 0) {
				log_fatal("If this happens because you are using an old content file,\n");
				log_fatal("you can 'sync' anyway using 'snapraid --force-full sync'\n");
				log_fatal("to force a full rebuild of the parity.\n");
			} else {
				log_fatal("It's possible that the parity disks are not mounted.\n");
				log_fatal("If instead you are adding a new parity level, you can 'sync' using\n");
				log_fatal("'snapraid --force-full sync' to force a full rebuild of the parity.\n");
			}
			exit(EXIT_FAILURE);
			/* LCOV_EXCL_STOP */
		}
	}

	unrecoverable_error = 0;

	if (state->opt.prehash) {
		msg_progress("Hashing...\n");

		ret = state_hash_process(state, blockstart, blockmax, &skip_sync);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			++unrecoverable_error;
			/* continue, in case also doing the sync if ::skip_sync is not set */
			/* LCOV_EXCL_STOP */
		}
	}

	if (!skip_sync) {
		msg_progress("Resizing...\n");

		/* now change the size of all parities */
		for (l = 0; l < state->level; ++l) {
			int is_modified;

			/* change the size of the parity file, truncating or extending it */
			/* from this point all the DELETED blocks after the end of the parity are invalid */
			/* and they are automatically removed when we save the new content file */
			ret = parity_chsize(&parity_handle[l], &state->parity[l], &is_modified, size, state->block_size, state->opt.skip_fallocate, state->opt.skip_space_holder);
			if (ret == -1) {
				/* LCOV_EXCL_START */
				data_off_t out_size;
				parity_size(&parity_handle[l], &out_size);
				parity_overflow(state, out_size);
				log_fatal("WARNING! Without a usable %s file, it isn't possible to sync.\n", lev_name(l));
				exit(EXIT_FAILURE);
				/* LCOV_EXCL_STOP */
			}

			if (is_modified)
				state->need_write = 1;
		}

		/* after resizing parity files, refresh again the free info */
		state_refresh(state);

		/**
		 * Save the new state before the sync but after the hashing phase
		 *
		 * This allows to recover after an aborted sync, and at the same time
		 * it allows to recover broken copied/moved files identified in the
		 * hashing phase.
		 *
		 * For example, think at this case:
		 * - Add some files at the array
		 * - Run a sync command, it will recompute the parity adding the new files
		 * - Abort the sync command before it stores the new content file
		 * - Delete the not yet synced files from the array
		 * - Run a new sync command
		 *
		 * The sync command has no way to know that the parity file was modified
		 * because the files triggering these changes are now deleted and they aren't
		 * listed in the content file.
		 * Instead, saving the new content file in advance, keeps track of all the parity
		 * that may be modified.
		 */
		if (!state->opt.skip_content_write) {
			if (state->need_write)
				state_write(state);
		} else {
			log_fatal("WARNING! Skipped state write for --test-skip-content-write option.\n");
		}

		msg_progress("Syncing...\n");

		/* skip degenerated cases of empty parity, or skipping all */
		if (blockstart < blockmax) {
			ret = state_sync_process(state, parity_handle, blockstart, blockmax);
			if (ret == -1) {
				/* LCOV_EXCL_START */
				++unrecoverable_error;
				/* continue, as we are already exiting */
				/* LCOV_EXCL_STOP */
			}
		} else {
			msg_status("Nothing to do\n");
		}
	}

	/* the parity files are closed even if the sync was skipped or failed */
	for (l = 0; l < state->level; ++l) {
		ret = parity_close(&parity_handle[l]);
		if (ret == -1) {
			/* LCOV_EXCL_START */
			log_fatal("DANGER! Unexpected close error in %s disk.\n", lev_name(l));
			++unrecoverable_error;
			/* continue, as we are already exiting */
			/* LCOV_EXCL_STOP */
		}
	}

	/* abort if required */
	if (unrecoverable_error != 0)
		return -1;
	return 0;
}
|
|
|