New upstream version 8.1.0

geos_one
2025-08-10 01:34:16 +02:00
commit c891bb7105
4398 changed files with 838833 additions and 0 deletions


@@ -0,0 +1,527 @@
/*
* Compatibility functions for older Linux versions
*/
#include <linux/mm.h> // for old sles10 kernels, which forgot to include it in backing-dev.h
#include <linux/backing-dev.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/writeback.h>
#include <os/OsCompat.h>
#include <app/App.h>
#include <app/log/Logger.h>
#include <common/Common.h>
#include <filesystem/FhgfsOpsSuper.h>
#ifndef KERNEL_HAS_MEMDUP_USER
/**
* memdup_user - duplicate memory region from user space
*
* @src: source address in user space
* @len: number of bytes to copy
*
* Returns an ERR_PTR() on failure.
*/
void *memdup_user(const void __user *src, size_t len)
{
void *p;
/*
* Always use GFP_KERNEL, since copy_from_user() can sleep and
* cause pagefault, which makes it pointless to use GFP_NOFS
* or GFP_ATOMIC.
*/
p = kmalloc(len, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
if (copy_from_user(p, src, len)) {
kfree(p);
return ERR_PTR(-EFAULT);
}
return p;
}
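/*
 * Illustrative sketch (not part of the upstream sources): typical use of the fallback
 * above from a handler that receives a user-space buffer; userPtr and len are
 * hypothetical parameters.
 *
 *    void* kernelCopy = memdup_user(userPtr, len);
 *    if (IS_ERR(kernelCopy) )
 *       return PTR_ERR(kernelCopy);
 *
 *    // ... work with the kernel-space copy ...
 *
 *    kfree(kernelCopy);
 */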
#endif // memdup_user, LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30)
#if defined(KERNEL_HAS_SB_BDI) && !defined(KERNEL_HAS_BDI_SETUP_AND_REGISTER) && \
!defined(KERNEL_HAS_SUPER_SETUP_BDI_NAME)
/*
* For use from filesystems to quickly init and register a bdi associated
* with dirty writeback
*/
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
unsigned int cap)
{
static atomic_long_t fhgfs_bdiSeq = ATOMIC_LONG_INIT(0);
char tmp[32];
int err;
bdi->name = name;
bdi->capabilities = cap;
err = bdi_init(bdi);
if (err)
return err;
sprintf(tmp, "%.28s%s", name, "-%d");
err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&fhgfs_bdiSeq));
if (err) {
bdi_destroy(bdi);
return err;
}
return 0;
}
#endif
/* NOTE: We can't do a feature detection for find_get_pages_tag(), as
* this function is in all headers of all supported kernel versions.
* However, it is only _exported_ since 2.6.22 and also only
* exported in RHEL >=5.10. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
/**
* find_get_pages_tag - find and return pages that match @tag
* @mapping: the address_space to search
* @index: the starting page index
* @tag: the tag index
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
*
* Like find_get_pages, except we only return pages which are tagged with
* @tag. We update @index to index the next page for the traversal.
*/
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
int tag, unsigned int nr_pages, struct page **pages)
{
unsigned int i;
unsigned int ret;
read_lock_irq(&mapping->tree_lock);
ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
(void **)pages, *index, nr_pages, tag);
for (i = 0; i < ret; i++)
page_cache_get(pages[i]);
if (ret)
*index = pages[ret - 1]->index + 1;
read_unlock_irq(&mapping->tree_lock);
return ret;
}
#endif // find_get_pages_tag() for <2.6.22
#ifndef KERNEL_HAS_D_MAKE_ROOT
/**
* This is the former d_alloc_root with an additional iput on error.
*/
struct dentry *d_make_root(struct inode *root_inode)
{
struct dentry* allocRes = d_alloc_root(root_inode);
if(!allocRes)
iput(root_inode);
return allocRes;
}
#endif
#ifndef KERNEL_HAS_D_MATERIALISE_UNIQUE
/**
* d_materialise_unique() was merged into d_splice_alias() in linux-3.19
*/
struct dentry* d_materialise_unique(struct dentry *dentry, struct inode *inode)
{
return d_splice_alias(inode, dentry);
}
#endif // KERNEL_HAS_D_MATERIALISE_UNIQUE
/**
* Note: Call this once during module init (and remember to call kmem_cache_destroy() )
*/
#if defined(KERNEL_HAS_KMEMCACHE_CACHE_FLAGS_CTOR)
struct kmem_cache* OsCompat_initKmemCache(const char* cacheName, size_t cacheSize,
void initFuncPtr(void* initObj, struct kmem_cache* cache, unsigned long flags) )
#elif defined(KERNEL_HAS_KMEMCACHE_CACHE_CTOR)
struct kmem_cache* OsCompat_initKmemCache(const char* cacheName, size_t cacheSize,
void initFuncPtr(struct kmem_cache* cache, void* initObj) )
#else
struct kmem_cache* OsCompat_initKmemCache(const char* cacheName, size_t cacheSize,
void initFuncPtr(void* initObj) )
#endif // LINUX_VERSION_CODE
{
struct kmem_cache* cache;
#if defined(KERNEL_HAS_SLAB_MEM_SPREAD)
unsigned long cacheFlags = SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD;
#else
unsigned long cacheFlags = SLAB_RECLAIM_ACCOUNT;
#endif
#if defined(KERNEL_HAS_KMEMCACHE_DTOR)
cache = kmem_cache_create(cacheName, cacheSize, 0, cacheFlags, initFuncPtr, NULL);
#else
cache = kmem_cache_create(cacheName, cacheSize, 0, cacheFlags, initFuncPtr);
#endif // LINUX_VERSION_CODE
return cache;
}
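/*
 * Illustrative sketch (not part of the upstream sources) of the call pattern described
 * in the note above; the cache name, object type and ctor are hypothetical and use the
 * single-argument ctor variant.
 *
 *    static struct kmem_cache* exampleCache;
 *
 *    static void Example_initObj(void* initObj)
 *    {
 *       memset(initObj, 0, sizeof(struct ExampleObj) );
 *    }
 *
 *    // during module init:
 *    exampleCache = OsCompat_initKmemCache("example_cache", sizeof(struct ExampleObj), Example_initObj);
 *    if (!exampleCache)
 *       return -ENOMEM;
 *
 *    // during module exit (as the note above reminds):
 *    kmem_cache_destroy(exampleCache);
 */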
#ifndef rbtree_postorder_for_each_entry_safe
static struct rb_node* rb_left_deepest_node(const struct rb_node* node)
{
for (;;)
{
if (node->rb_left)
node = node->rb_left;
else
if (node->rb_right)
node = node->rb_right;
else
return (struct rb_node*) node;
}
}
struct rb_node* rb_next_postorder(const struct rb_node* node)
{
const struct rb_node *parent;
if (!node)
return NULL;
parent = rb_parent(node);
/* If we're sitting on node, we've already seen our children */
if (parent && node == parent->rb_left && parent->rb_right)
{
/* If we are the parent's left node, go to the parent's right
* node then all the way down to the left */
return rb_left_deepest_node(parent->rb_right);
}
else
/* Otherwise we are the parent's right node, and the parent
* should be next */
return (struct rb_node*) parent;
}
struct rb_node* rb_first_postorder(const struct rb_root* root)
{
if (!root->rb_node)
return NULL;
return rb_left_deepest_node(root->rb_node);
}
#endif
#ifdef KERNEL_HAS_GENERIC_WRITE_CHECKS_ITER
int os_generic_write_checks(struct file* filp, loff_t* offset, size_t* size, int isblk)
{
struct iovec iov = { 0, *size };
struct iov_iter iter;
ssize_t checkRes;
struct kiocb iocb;
iov_iter_init(&iter, WRITE, &iov, 1, *size);
init_sync_kiocb(&iocb, filp);
iocb.ki_pos = *offset;
checkRes = generic_write_checks(&iocb, &iter);
if(checkRes < 0)
return checkRes;
*offset = iocb.ki_pos;
*size = iter.count;
return 0;
}
#endif
#ifndef KERNEL_HAS_HAVE_SUBMOUNTS
/**
* enum d_walk_ret - action to take during tree walk
* @D_WALK_CONTINUE: continue walk
* @D_WALK_QUIT: quit walk
* @D_WALK_NORETRY: quit when retry is needed
* @D_WALK_SKIP: skip this dentry and its children
*/
enum d_walk_ret {
D_WALK_CONTINUE,
D_WALK_QUIT,
D_WALK_NORETRY,
D_WALK_SKIP,
};
/*
* Search for at least 1 mount point in the dentry's subdirs.
* We descend to the next level whenever the d_subdirs
* list is non-empty and continue searching.
*/
static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
{
int *ret = data;
if (d_mountpoint(dentry)) {
*ret = 1;
return D_WALK_QUIT;
}
return D_WALK_CONTINUE;
}
#if defined(KERNEL_HAS_DENTRY_SUBDIRS)
/**
* d_walk - walk the dentry tree
* @parent: start of walk
* @data: data passed to @enter() and @finish()
* @enter: callback when first entering the dentry
* @finish: callback when successfully finished the walk
*
* The @enter() and @finish() callbacks are called with d_lock held.
*/
static void d_walk(struct dentry *parent, void *data,
enum d_walk_ret (*enter)(void *, struct dentry *),
void (*finish)(void *))
{
struct dentry *this_parent;
struct list_head *next;
unsigned seq = 0;
enum d_walk_ret ret;
bool retry = true;
again:
read_seqbegin_or_lock(&rename_lock, &seq);
this_parent = parent;
spin_lock(&this_parent->d_lock);
ret = enter(data, this_parent);
switch (ret) {
case D_WALK_CONTINUE:
break;
case D_WALK_QUIT:
case D_WALK_SKIP:
goto out_unlock;
case D_WALK_NORETRY:
retry = false;
break;
}
repeat:
next = this_parent->d_subdirs.next;
resume:
while (next != &this_parent->d_subdirs) {
struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
next = tmp->next;
if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
continue;
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
ret = enter(data, dentry);
switch (ret) {
case D_WALK_CONTINUE:
break;
case D_WALK_QUIT:
spin_unlock(&dentry->d_lock);
goto out_unlock;
case D_WALK_NORETRY:
retry = false;
break;
case D_WALK_SKIP:
spin_unlock(&dentry->d_lock);
continue;
}
if (!list_empty(&dentry->d_subdirs)) {
spin_unlock(&this_parent->d_lock);
#if defined(KERNEL_SPIN_RELEASE_HAS_3_ARGUMENTS)
spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
#else
spin_release(&dentry->d_lock.dep_map, _RET_IP_);
#endif
this_parent = dentry;
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
goto repeat;
}
spin_unlock(&dentry->d_lock);
}
/*
* All done at this level ... ascend and resume the search.
*/
rcu_read_lock();
ascend:
if (this_parent != parent) {
struct dentry *child = this_parent;
this_parent = child->d_parent;
spin_unlock(&child->d_lock);
spin_lock(&this_parent->d_lock);
/* might go back up the wrong parent if we have had a rename. */
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
/* go into the first sibling still alive */
do {
next = child->d_child.next;
if (next == &this_parent->d_subdirs)
goto ascend;
child = list_entry(next, struct dentry, d_child);
} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
rcu_read_unlock();
goto resume;
}
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
rcu_read_unlock();
if (finish)
finish(data);
out_unlock:
spin_unlock(&this_parent->d_lock);
done_seqretry(&rename_lock, seq);
return;
rename_retry:
spin_unlock(&this_parent->d_lock);
rcu_read_unlock();
BUG_ON(seq & 1);
if (!retry)
return;
seq = 1;
goto again;
}
#else
/**
* d_walk - walk the dentry tree
* @parent: start of walk
* @data: data passed to @enter() and @finish()
* @enter: callback when first entering the dentry
*
* The @enter() callbacks are called with d_lock held.
*/
static void d_walk(struct dentry *parent, void *data,
enum d_walk_ret (*enter)(void *, struct dentry *))
{
struct dentry *this_parent, *dentry;
unsigned seq = 0;
enum d_walk_ret ret;
bool retry = true;
again:
read_seqbegin_or_lock(&rename_lock, &seq);
this_parent = parent;
spin_lock(&this_parent->d_lock);
ret = enter(data, this_parent);
switch (ret) {
case D_WALK_CONTINUE:
break;
case D_WALK_QUIT:
case D_WALK_SKIP:
goto out_unlock;
case D_WALK_NORETRY:
retry = false;
break;
}
repeat:
dentry = d_first_child(this_parent);
resume:
hlist_for_each_entry_from(dentry, d_sib) {
if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
continue;
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
ret = enter(data, dentry);
switch (ret) {
case D_WALK_CONTINUE:
break;
case D_WALK_QUIT:
spin_unlock(&dentry->d_lock);
goto out_unlock;
case D_WALK_NORETRY:
retry = false;
break;
case D_WALK_SKIP:
spin_unlock(&dentry->d_lock);
continue;
}
if (!hlist_empty(&dentry->d_children)) {
spin_unlock(&this_parent->d_lock);
spin_release(&dentry->d_lock.dep_map, _RET_IP_);
this_parent = dentry;
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
goto repeat;
}
spin_unlock(&dentry->d_lock);
}
/*
* All done at this level ... ascend and resume the search.
*/
rcu_read_lock();
ascend:
if (this_parent != parent) {
dentry = this_parent;
this_parent = dentry->d_parent;
spin_unlock(&dentry->d_lock);
spin_lock(&this_parent->d_lock);
/* might go back up the wrong parent if we have had a rename. */
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
/* go into the first sibling still alive */
hlist_for_each_entry_continue(dentry, d_sib) {
if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
rcu_read_unlock();
goto resume;
}
}
goto ascend;
}
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
rcu_read_unlock();
out_unlock:
spin_unlock(&this_parent->d_lock);
done_seqretry(&rename_lock, seq);
return;
rename_retry:
spin_unlock(&this_parent->d_lock);
rcu_read_unlock();
BUG_ON(seq & 1);
if (!retry)
return;
seq = 1;
goto again;
}
#endif
/**
* have_submounts - check for mounts over a dentry
* @parent: dentry to check.
*
* Return true if the parent or its subdirectories contain
* a mount point
*/
int have_submounts(struct dentry *parent)
{
int ret = 0;
#if defined(KERNEL_HAS_DENTRY_SUBDIRS)
d_walk(parent, &ret, check_mount, NULL);
#else
d_walk(parent, &ret, check_mount);
#endif
return ret;
}
#endif


@@ -0,0 +1,396 @@
/*
* Compatibility functions for older Linux versions
*/
#ifndef OSCOMPAT_H_
#define OSCOMPAT_H_
#include <common/Common.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/compat.h>
#include <linux/list.h>
#include <linux/mount.h>
#include <linux/posix_acl_xattr.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/semaphore.h>
#ifndef KERNEL_HAS_MEMDUP_USER
extern void *memdup_user(const void __user *src, size_t len);
#endif
#ifndef KERNEL_HAS_D_MAKE_ROOT
extern struct dentry *d_make_root(struct inode *root_inode);
#endif
#if defined(KERNEL_HAS_SB_BDI) && !defined(KERNEL_HAS_BDI_SETUP_AND_REGISTER)
extern int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, unsigned int cap);
#endif
#ifndef KERNEL_HAS_HAVE_SUBMOUNTS
extern int have_submounts(struct dentry *parent);
#endif
/*
* PG_error and SetPageError() have been deprecated and removed in Linux 6.12.
* We now use mapping_set_error() to record writeback errors at the address_space level.
*
* This ensures compatibility with kernels >= 4.19 and aligns with the new writeback
* error tracking model using errseq_t (see LWN: https://lwn.net/Articles/724307/).
*
* BeeGFS compatibility:
* - Buffered mode paths already use filemap_fdatawait(), which calls filemap_check_errors().
* - Native mode uses file_write_and_wait_range(), which calls file_check_and_advance_wb_err().
*/
/**
* fhgfs_set_wb_error - Record a writeback error at the mapping level
*
* Replaces SetPageError(); safe across all supported kernels.
*
* @page: the page associated with the mapping
* @err: the error code
*/
static inline void fhgfs_set_wb_error(struct page *page, int err)
{
if (page && page->mapping && err)
mapping_set_error(page->mapping, err);
}
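/*
 * Illustrative sketch (not part of the upstream sources): a writeback completion path
 * would record a failed write roughly like this instead of the removed SetPageError();
 * writeRes is a hypothetical per-page write result.
 *
 *    if (writeRes < 0)
 *       fhgfs_set_wb_error(page, writeRes);
 *
 *    end_page_writeback(page);
 */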
/**
* generic_permission() compatibility function
*
* NOTE: Only kernels > 2.6.32 do have inode->i_op->check_acl, but as we do not
* support it anyway for now, we do not need a complete kernel version check for it.
* Also, in order to skip useless pointer references we just pass NULL here.
*/
static inline int os_generic_permission(struct inode *inode, int mask)
{
#ifdef KERNEL_HAS_GENERIC_PERMISSION_2
return generic_permission(inode, mask);
#elif defined(KERNEL_HAS_GENERIC_PERMISSION_4)
return generic_permission(inode, mask, 0, NULL);
#elif defined(KERNEL_HAS_IDMAPPED_MOUNTS)
return generic_permission(&nop_mnt_idmap, inode, mask);
#elif defined(KERNEL_HAS_USER_NS_MOUNTS)
return generic_permission(&init_user_ns, inode, mask);
#else
return generic_permission(inode, mask, NULL);
#endif
}
#if defined(KERNEL_HAS_GENERIC_FILLATTR_REQUEST_MASK)
static inline void os_generic_fillattr(struct inode *inode, struct kstat *kstat, u32 request_mask)
#else
static inline void os_generic_fillattr(struct inode *inode, struct kstat *kstat)
#endif
{
#if defined(KERNEL_HAS_IDMAPPED_MOUNTS)
#if defined(KERNEL_HAS_GENERIC_FILLATTR_REQUEST_MASK)
generic_fillattr(&nop_mnt_idmap, request_mask, inode, kstat);
#else
generic_fillattr(&nop_mnt_idmap, inode, kstat);
#endif // KERNEL_HAS_GENERIC_FILLATTR_REQUEST_MASK
#elif defined(KERNEL_HAS_USER_NS_MOUNTS)
generic_fillattr(&init_user_ns, inode, kstat);
#else
generic_fillattr(inode, kstat);
#endif
}
#ifdef KERNEL_HAS_SETATTR_PREPARE
static inline int os_setattr_prepare(struct dentry *dentry, struct iattr *attr)
{
#if defined(KERNEL_HAS_IDMAPPED_MOUNTS)
return setattr_prepare(&nop_mnt_idmap, dentry, attr);
#elif defined(KERNEL_HAS_USER_NS_MOUNTS)
return setattr_prepare(&init_user_ns, dentry, attr);
#else
return setattr_prepare(dentry, attr);
#endif
}
#endif // KERNEL_HAS_SETATTR_PREPARE
static inline bool os_inode_owner_or_capable(const struct inode *inode)
{
#if defined(KERNEL_HAS_IDMAPPED_MOUNTS)
return inode_owner_or_capable(&nop_mnt_idmap, inode);
#elif defined(KERNEL_HAS_USER_NS_MOUNTS)
return inode_owner_or_capable(&init_user_ns, inode);
#else
return inode_owner_or_capable(inode);
#endif
}
#ifndef KERNEL_HAS_D_MATERIALISE_UNIQUE
extern struct dentry* d_materialise_unique(struct dentry *dentry, struct inode *inode);
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32)
/**
* Taken from ext3 dir.c. is_compat_task() probably works for all kernels, as it was already there.
* But we are conservative and only allow it for recent kernels.
*/
static inline int is_32bit_api(void)
{
#ifdef CONFIG_COMPAT
# ifdef in_compat_syscall
return in_compat_syscall();
# else
return is_compat_task();
# endif
#else
return (BITS_PER_LONG == 32);
#endif
}
#else
static inline int is_32bit_api(void)
{
return (BITS_PER_LONG == 32);
}
#endif // LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32)
#ifndef KERNEL_HAS_I_UID_READ
static inline uid_t i_uid_read(const struct inode *inode)
{
return inode->i_uid;
}
static inline gid_t i_gid_read(const struct inode *inode)
{
return inode->i_gid;
}
static inline void i_uid_write(struct inode *inode, uid_t uid)
{
inode->i_uid = uid;
}
static inline void i_gid_write(struct inode *inode, gid_t gid)
{
inode->i_gid = gid;
}
#endif // KERNEL_HAS_I_UID_READ
#if defined(KERNEL_HAS_KMEMCACHE_CACHE_FLAGS_CTOR)
struct kmem_cache* OsCompat_initKmemCache(const char* cacheName, size_t cacheSize,
void initFuncPtr(void* initObj, struct kmem_cache* cache, unsigned long flags) );
#elif defined(KERNEL_HAS_KMEMCACHE_CACHE_CTOR)
struct kmem_cache* OsCompat_initKmemCache(const char* cacheName, size_t cacheSize,
void initFuncPtr(struct kmem_cache* cache, void* initObj) );
#else
struct kmem_cache* OsCompat_initKmemCache(const char* cacheName, size_t cacheSize,
void initFuncPtr(void* initObj) );
#endif // LINUX_VERSION_CODE
// added to 3.13, backported to -stable
#ifndef list_next_entry
/**
* list_next_entry - get the next element in list
* @pos: the type * to cursor
* @member: the name of the list_struct within the struct.
*/
#define list_next_entry(pos, member) \
list_entry((pos)->member.next, typeof(*(pos)), member)
#endif
#ifndef list_first_entry
/**
* list_first_entry - get the first element from a list
* @ptr: the list head to take the element from.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_struct within the struct.
*
* Note, that list is expected to be not empty.
*/
#define list_first_entry(ptr, type, member) \
list_entry((ptr)->next, type, member)
#endif // list_first_entry
static inline struct posix_acl* os_posix_acl_from_xattr(const void* value, size_t size)
{
#ifndef KERNEL_HAS_POSIX_ACL_XATTR_USERNS_ARG
return posix_acl_from_xattr(value, size);
#else
return posix_acl_from_xattr(&init_user_ns, value, size);
#endif
}
static inline int os_posix_acl_to_xattr(const struct posix_acl* acl, void* buffer, size_t size)
{
#ifndef KERNEL_HAS_POSIX_ACL_XATTR_USERNS_ARG
return posix_acl_to_xattr(acl, buffer, size);
#else
return posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
#endif
}
#if defined(KERNEL_HAS_SET_ACL) || defined(KERNEL_HAS_SET_ACL_DENTRY)
static inline int os_posix_acl_chmod(struct dentry *dentry, umode_t mode)
{
#if defined(KERNEL_HAS_IDMAPPED_MOUNTS)
return posix_acl_chmod(&nop_mnt_idmap, dentry, mode);
#elif defined(KERNEL_HAS_POSIX_ACL_CHMOD_NS_DENTRY)
return posix_acl_chmod(&init_user_ns, dentry, mode);
#elif defined(KERNEL_HAS_USER_NS_MOUNTS)
return posix_acl_chmod(&init_user_ns, dentry->d_inode, mode);
#else
return posix_acl_chmod(dentry->d_inode, mode);
#endif
}
#endif // KERNEL_HAS_SET_ACL || KERNEL_HAS_SET_ACL_DENTRY
#ifndef KERNEL_HAS_PAGE_ENDIO
static inline void page_endio(struct page *page, int rw, int err)
{
if (rw == READ)
{
if (!err)
{
SetPageUptodate(page);
}
else
{
ClearPageUptodate(page);
fhgfs_set_wb_error(page, err);
}
unlock_page(page);
}
else
{ /* rw == WRITE */
if (err)
{
fhgfs_set_wb_error(page, err);
}
end_page_writeback(page);
}
}
#endif
#ifndef KERNEL_HAS_GENERIC_WRITE_CHECKS_ITER
# define os_generic_write_checks generic_write_checks
#else
extern int os_generic_write_checks(struct file* filp, loff_t* offset, size_t* size, int isblk);
#endif
#ifndef rb_entry_safe
#define rb_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? rb_entry(____ptr, type, member) : NULL; \
})
#endif
#ifndef rbtree_postorder_for_each_entry_safe
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
typeof(*pos), field); 1; }); \
pos = n)
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
#endif
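/*
 * Illustrative sketch (not part of the upstream sources): postorder iteration visits each
 * node after its children, so it is safe for tearing down a whole tree; struct ExampleNode
 * and exampleTreeRoot are hypothetical.
 *
 *    struct ExampleNode { struct rb_node rbNode; };
 *
 *    struct ExampleNode* pos;
 *    struct ExampleNode* n;
 *
 *    rbtree_postorder_for_each_entry_safe(pos, n, &exampleTreeRoot, rbNode)
 *       kfree(pos);
 */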
#ifndef KERNEL_HAS_CURRENT_UMASK
#define current_umask() (current->fs->umask)
#endif
#ifndef XATTR_NAME_POSIX_ACL_ACCESS
# define XATTR_POSIX_ACL_ACCESS "posix_acl_access"
# define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS
# define XATTR_POSIX_ACL_DEFAULT "posix_acl_default"
# define XATTR_NAME_POSIX_ACL_DEFAULT XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_DEFAULT
#endif
#ifndef KERNEL_HAS_I_MMAP_LOCK
static inline void i_mmap_lock_read(struct address_space* mapping)
{
#if defined(KERNEL_HAS_I_MMAP_RWSEM)
down_read(&mapping->i_mmap_rwsem);
#elif defined(KERNEL_HAS_I_MMAP_MUTEX)
mutex_lock(&mapping->i_mmap_mutex);
#else
spin_lock(&mapping->i_mmap_lock);
#endif
}
static inline void i_mmap_unlock_read(struct address_space* mapping)
{
#if defined(KERNEL_HAS_I_MMAP_RWSEM)
up_read(&mapping->i_mmap_rwsem);
#elif defined(KERNEL_HAS_I_MMAP_MUTEX)
mutex_unlock(&mapping->i_mmap_mutex);
#else
spin_unlock(&mapping->i_mmap_lock);
#endif
}
#endif
static inline bool beegfs_hasMappings(struct inode* inode)
{
#if defined(KERNEL_HAS_I_MMAP_RBTREE)
if (!RB_EMPTY_ROOT(&inode->i_mapping->i_mmap))
return true;
#elif defined(KERNEL_HAS_I_MMAP_CACHED_RBTREE)
if (!RB_EMPTY_ROOT(&inode->i_mapping->i_mmap.rb_root))
return true;
#else
if (!prio_tree_empty(&inode->i_mapping->i_mmap))
return true;
#endif
#ifdef KERNEL_HAS_I_MMAP_NONLINEAR
if (!list_empty(&inode->i_mapping->i_mmap_nonlinear))
return true;
#endif
return false;
}
#ifndef KERNEL_HAS_INODE_LOCK
static inline void os_inode_lock(struct inode* inode)
{
mutex_lock(&inode->i_mutex);
}
static inline void os_inode_unlock(struct inode* inode)
{
mutex_unlock(&inode->i_mutex);
}
#else
static inline void os_inode_lock(struct inode* inode)
{
inode_lock(inode);
}
static inline void os_inode_unlock(struct inode* inode)
{
inode_unlock(inode);
}
#endif
#if defined(KERNEL_ACCESS_OK_WANTS_TYPE)
# define os_access_ok(type, addr, size) access_ok(type, addr, size)
#else
# define os_access_ok(type, addr, size) access_ok(addr, size)
#endif
#endif /* OSCOMPAT_H_ */


@@ -0,0 +1,117 @@
#include <common/Common.h>
#include <os/OsDeps.h>
#include <common/FhgfsTypes.h>
#include <common/net/sock/NicAddress.h>
#include <common/net/sock/Socket.h>
#include <filesystem/FhgfsOps_versions.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/inetdevice.h>
#ifdef CONFIG_STACKTRACE
#include <linux/stacktrace.h>
#endif
#define MAX_STACK_TRACE_CHAIN 16 // number of functions to save in a stack trace
#ifdef BEEGFS_DEBUG
// Significant parts of the kernel code around struct stack_trace are removed
// when CONFIG_ARCH_STACKWALK is set. Code below needs to be rewritten to work
// with newer kernels that have CONFIG_ARCH_STACKWALK enabled.
#if defined CONFIG_STACKTRACE && !defined CONFIG_ARCH_STACKWALK
/**
* Save a given trace. NOTE: Allocated memory has to be freed later on!
*/
void* os_saveStackTrace(void)
{
struct stack_trace* trace;
unsigned long *entries;
trace = kmalloc(sizeof(struct stack_trace), GFP_NOFS);
if (!trace)
return NULL; // out of memory?
entries = kmalloc(MAX_STACK_TRACE_CHAIN * sizeof(*entries), GFP_NOFS);
if (!entries)
{ // out of memory?
kfree(trace);
return NULL;
}
trace->nr_entries = 0;
trace->max_entries = MAX_STACK_TRACE_CHAIN;
trace->entries = entries;
trace->skip = 1; // cut off ourself, so 1
save_stack_trace(trace);
return trace;
}
void os_freeStackTrace(void *trace)
{
struct stack_trace* os_trace = (struct stack_trace*)trace;
if (!trace)
{ // May be NULL, if kmalloc or vmalloc failed
return;
}
kfree(os_trace->entries);
kfree(os_trace);
}
/**
* Print a stack trace
*
* @param trace The stack trace to print
* @param spaces Insert 'spaces' white-spaces at the beginning of the line
*/
void os_printStackTrace(void* trace, int spaces)
{
if (!trace)
{ // May be NULL, if kmalloc or vmalloc failed
return;
}
{
struct stack_trace *stack_trace = trace;
#if defined(KERNEL_HAS_PRINT_STACK_TRACE)
print_stack_trace(stack_trace, spaces);
#elif defined(KERNEL_HAS_STACK_TRACE_PRINT)
stack_trace_print(stack_trace->entries, stack_trace->nr_entries, spaces);
#else
(void) stack_trace;
#endif
}
}
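/*
 * Illustrative sketch (not part of the upstream sources) of the intended lifecycle: the
 * pointer returned by os_saveStackTrace() has to be released with os_freeStackTrace();
 * both os_printStackTrace() and os_freeStackTrace() tolerate a NULL trace.
 *
 *    void* exampleTrace = os_saveStackTrace();
 *
 *    os_printStackTrace(exampleTrace, 3); // indent printed lines by 3 spaces
 *    os_freeStackTrace(exampleTrace);
 */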
#else // no CONFIG_STACKTRACE or CONFIG_ARCH_STACKWALK enabled => nothing to do at all
void* os_saveStackTrace(void)
{
return NULL;
}
void os_printStackTrace(void* trace, int spaces)
{
printk_fhgfs(KERN_INFO, "Kernel without stack trace support!\n");
return;
}
void os_freeStackTrace(void* trace)
{
return;
}
#endif // CONFIG_STACKTRACE && !CONFIG_ARCH_STACKWALK
#endif // BEEGFS_DEBUG


@@ -0,0 +1,71 @@
#ifndef OPEN_OSDEPS_H_
#define OPEN_OSDEPS_H_
#include <filesystem/FhgfsOps_versions.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#ifdef BEEGFS_DEBUG
extern void* os_saveStackTrace(void);
extern void os_printStackTrace(void * trace, int spaces);
extern void os_freeStackTrace(void *trace);
#endif // BEEGFS_DEBUG
// inliners
static inline void* os_kmalloc(size_t size);
static inline void* os_kzalloc(size_t size);
static inline int os_strnicmp(const char* s1, const char* s2, size_t n);
void* os_kmalloc(size_t size)
{
void* buf = kmalloc(size, GFP_NOFS);
if(unlikely(!buf) )
{
printk(KERN_WARNING BEEGFS_MODULE_NAME_STR ": kmalloc of '%d' bytes failed. Retrying...\n", (int)size);
buf = kmalloc(size, GFP_NOFS | __GFP_NOFAIL);
printk(KERN_WARNING BEEGFS_MODULE_NAME_STR ": kmalloc retry of '%d' bytes succeeded\n", (int)size);
}
return buf;
}
void* os_kzalloc(size_t size)
{
void* buf = kzalloc(size, GFP_NOFS);
if(unlikely(!buf) )
{
printk(KERN_WARNING BEEGFS_MODULE_NAME_STR ": kzalloc of '%d' bytes failed. Retrying...\n", (int)size);
buf = kzalloc(size, GFP_NOFS | __GFP_NOFAIL);
printk(KERN_WARNING BEEGFS_MODULE_NAME_STR ": kzalloc retry of '%d' bytes succeeded\n", (int)size);
}
return buf;
}
/**
* strncasecmp was broken in the linux kernel pre-3.18. strnicmp was
* implemented correctly in that timeframe. In kernel >= 3.18, strnicmp
* is either a wrapper for strncasecmp or is not defined.
*/
int os_strnicmp(const char *s1, const char *s2, size_t n)
{
#ifdef KERNEL_HAS_STRNICMP
return strnicmp(s1, s2, n);
#else
return strncasecmp(s1, s2, n);
#endif
}
#endif /* OPEN_OSDEPS_H_ */


@@ -0,0 +1,201 @@
#ifndef OSTYPECONVERSION_INTERNAL_H_
#define OSTYPECONVERSION_INTERNAL_H_
#include <common/Common.h>
#include <os/OsTypeConversion.h>
#include <common/toolkit/Time.h>
#include <common/storage/StorageDefinitions.h>
#include <linux/fs.h>
#if defined(KERNEL_HAS_LINUX_FILELOCK_H)
#include <linux/filelock.h>
#endif
static inline int OsTypeConv_openFlagsOsToFhgfs(int osFlags, bool isPagedMode);
static inline void OsTypeConv_kstatFhgfsToOs(fhgfs_stat* fhgfsStat, struct kstat* kStat);
static inline void OsTypeConv_iattrOsToFhgfs(struct iattr* iAttr, SettableFileAttribs* fhgfsAttr,
int* outValidAttribs);
static inline unsigned OsTypeConv_dirEntryTypeToOS(DirEntryType entryType);
static inline int OsTypeConv_flockTypeToFhgfs(struct file_lock* fileLock);
/**
* @param osFlags file open mode flags
* @return OPENFILE_ACCESS_... flags
*/
int OsTypeConv_openFlagsOsToFhgfs(int osFlags, bool isPagedMode)
{
int fhgfsFlags = 0;
if(osFlags & O_RDWR)
fhgfsFlags |= OPENFILE_ACCESS_READWRITE;
else
if(osFlags & O_WRONLY)
{
if (!isPagedMode)
fhgfsFlags |= OPENFILE_ACCESS_WRITE;
else
{ /* in order to update read-modify-write pages with the storage content we need a
* read-write handle */
fhgfsFlags |= OPENFILE_ACCESS_READWRITE;
}
}
else
fhgfsFlags |= OPENFILE_ACCESS_READ;
if(osFlags & O_APPEND)
fhgfsFlags |= OPENFILE_ACCESS_APPEND;
if(osFlags & O_TRUNC)
fhgfsFlags |= OPENFILE_ACCESS_TRUNC;
if(osFlags & O_DIRECT)
fhgfsFlags |= OPENFILE_ACCESS_DIRECT;
if(osFlags & O_SYNC)
fhgfsFlags |= OPENFILE_ACCESS_SYNC;
return fhgfsFlags;
}
/**
* @param kStat unused fields will be set to zero
*/
void OsTypeConv_kstatFhgfsToOs(fhgfs_stat* fhgfsStat, struct kstat* kStat)
{
memset(kStat, 0, sizeof(*kStat) );
kStat->mode = fhgfsStat->mode;
kStat->nlink = fhgfsStat->nlink;
kStat->uid = make_kuid(&init_user_ns, fhgfsStat->uid);
kStat->gid = make_kgid(&init_user_ns, fhgfsStat->gid);
kStat->size = fhgfsStat->size;
kStat->blocks = fhgfsStat->blocks;
kStat->atime.tv_sec = fhgfsStat->atime.tv_sec;
kStat->atime.tv_nsec = fhgfsStat->atime.tv_nsec;
kStat->mtime.tv_sec = fhgfsStat->mtime.tv_sec;
kStat->mtime.tv_nsec = fhgfsStat->mtime.tv_nsec;
kStat->ctime.tv_sec = fhgfsStat->ctime.tv_sec; // attrib change time (not creation time)
kStat->ctime.tv_nsec = fhgfsStat->ctime.tv_nsec; // attrib change time (not creation time)
}
/**
* Convert kernel iattr to fhgfsAttr. Also update the inode with the new attributes.
*/
void OsTypeConv_iattrOsToFhgfs(struct iattr* iAttr, SettableFileAttribs* fhgfsAttr,
int* outValidAttribs)
{
Time now;
Time_setToNowReal(&now);
*outValidAttribs = 0;
if(iAttr->ia_valid & ATTR_MODE)
{
(*outValidAttribs) |= SETATTR_CHANGE_MODE;
fhgfsAttr->mode = iAttr->ia_mode;
}
if(iAttr->ia_valid & ATTR_UID)
{
(*outValidAttribs) |= SETATTR_CHANGE_USERID;
fhgfsAttr->userID = from_kuid(&init_user_ns, iAttr->ia_uid);
}
if(iAttr->ia_valid & ATTR_GID)
{
(*outValidAttribs) |= SETATTR_CHANGE_GROUPID;
fhgfsAttr->groupID = from_kgid(&init_user_ns, iAttr->ia_gid);
}
if(iAttr->ia_valid & ATTR_MTIME_SET)
{
(*outValidAttribs) |= SETATTR_CHANGE_MODIFICATIONTIME;
fhgfsAttr->modificationTimeSecs = iAttr->ia_mtime.tv_sec;
}
else
if(iAttr->ia_valid & ATTR_MTIME)
{ // set mtime to "now"
(*outValidAttribs) |= SETATTR_CHANGE_MODIFICATIONTIME;
fhgfsAttr->modificationTimeSecs = now.tv_sec;
}
if(iAttr->ia_valid & ATTR_ATIME_SET)
{
(*outValidAttribs) |= SETATTR_CHANGE_LASTACCESSTIME;
fhgfsAttr->lastAccessTimeSecs = iAttr->ia_atime.tv_sec;
}
else
if(iAttr->ia_valid & ATTR_ATIME)
{ // set atime to "now"
(*outValidAttribs) |= SETATTR_CHANGE_LASTACCESSTIME;
fhgfsAttr->lastAccessTimeSecs = now.tv_sec;
}
}
/**
* Convert fhgfs DirEntryType to OS DT_... for readdir()'s filldir.
*/
unsigned OsTypeConv_dirEntryTypeToOS(DirEntryType entryType)
{
if(DirEntryType_ISDIR(entryType) )
return DT_DIR;
if(DirEntryType_ISREGULARFILE(entryType) )
return DT_REG;
if(DirEntryType_ISSYMLINK(entryType) )
return DT_LNK;
if(DirEntryType_ISBLOCKDEV(entryType) )
return DT_BLK;
if(DirEntryType_ISCHARDEV(entryType) )
return DT_CHR;
if(DirEntryType_ISFIFO(entryType) )
return DT_FIFO;
if(DirEntryType_ISSOCKET(entryType) )
return DT_SOCK;
return DT_UNKNOWN;
}
/**
* Convert the OS F_..LCK lock type flags of a flock operation to fhgfs ENTRYLOCKTYPE_... lock type
* flags.
*/
static inline int OsTypeConv_flockTypeToFhgfs(struct file_lock* fileLock)
{
int fhgfsLockFlags = 0;
switch(FhgfsCommon_getFileLockType(fileLock))
{
case F_RDLCK:
{
fhgfsLockFlags = ENTRYLOCKTYPE_SHARED;
} break;
case F_WRLCK:
{
fhgfsLockFlags = ENTRYLOCKTYPE_EXCLUSIVE;
} break;
default:
{
fhgfsLockFlags = ENTRYLOCKTYPE_UNLOCK;
} break;
}
if(!(FhgfsCommon_getFileLockFlags(fileLock) & FL_SLEEP) )
fhgfsLockFlags |= ENTRYLOCKTYPE_NOWAIT;
return fhgfsLockFlags;
}
#endif /* OSTYPECONVERSION_INTERNAL_H_ */


@@ -0,0 +1,215 @@
#include <common/Common.h>
#include <asm/atomic.h> // also adds ATOMIC64_INIT if available
#ifndef ATOMIC64_INIT // basic test if the kernel already provides atomic64_t
/*
* Note: Below is the atomic64.c copied and modified from linux-git, for architectures that do
* not support native 64-bit atomic instructions in hardware. As we need compatibility with
* older kernels, we had to replace the usage of raw_spin_locks. This is probably
* slower, and therefore the in-kernel implementation should be used if available.
*/
/*
* Generic implementation of 64-bit atomics using spinlocks,
* useful on processors that don't have 64-bit atomic instructions.
*
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/init.h>
// #include <linux/export.h> // disabled as not available in 2.6.16
// #include <linux/atomic.h> // disabled as not available in 2.6.16
#include "atomic64.h" // added for fhgfs
#if 0 // disabled for the simplified fhgfs version
/*
* We use a hashed array of spinlocks to provide exclusive access
* to each atomic64_t variable. Since this is expected to used on
* systems with small numbers of CPUs (<= 4 or so), we use a
* relatively small array of 16 spinlocks to avoid wasting too much
* memory on the spinlock array.
*/
#define NR_LOCKS 16
/*
* Ensure each lock is in a separate cacheline.
*/
static union {
spinlock_t lock;
char pad[L1_CACHE_BYTES];
} atomic64_lock[NR_LOCKS] __cacheline_aligned_in_smp = {
[0 ... (NR_LOCKS - 1)] = {
.lock = __RAW_SPIN_LOCK_UNLOCKED(atomic64_lock.lock),
},
};
static inline spinlock_t *lock_addr(const atomic64_t *v)
{
unsigned long addr = (unsigned long) v;
addr >>= L1_CACHE_SHIFT;
addr ^= (addr >> 8) ^ (addr >> 16);
return &atomic64_lock[addr & (NR_LOCKS - 1)].lock;
}
#endif
/**
* Simplified version for fhgfs
*/
static inline spinlock_t *lock_addr(const atomic64_t *v)
{
atomic64_t* value = (atomic64_t*) v;
return &value->lock;
}
long long atomic64_read(const atomic64_t *v)
{
unsigned long flags;
spinlock_t *lock = lock_addr(v);
long long val;
spin_lock_irqsave(lock, flags);
val = v->counter;
spin_unlock_irqrestore(lock, flags);
return val;
}
// EXPORT_SYMBOL(atomic64_read);
void atomic64_set(atomic64_t *v, long long i)
{
unsigned long flags;
spinlock_t *lock = lock_addr(v);
spin_lock_irqsave(lock, flags);
v->counter = i;
spin_unlock_irqrestore(lock, flags);
}
// EXPORT_SYMBOL(atomic64_set);
void atomic64_add(long long a, atomic64_t *v)
{
unsigned long flags;
spinlock_t *lock = lock_addr(v);
spin_lock_irqsave(lock, flags);
v->counter += a;
spin_unlock_irqrestore(lock, flags);
}
// EXPORT_SYMBOL(atomic64_add);
long long atomic64_add_return(long long a, atomic64_t *v)
{
unsigned long flags;
spinlock_t *lock = lock_addr(v);
long long val;
spin_lock_irqsave(lock, flags);
val = v->counter += a;
spin_unlock_irqrestore(lock, flags);
return val;
}
// EXPORT_SYMBOL(atomic64_add_return);
void atomic64_sub(long long a, atomic64_t *v)
{
unsigned long flags;
spinlock_t *lock = lock_addr(v);
spin_lock_irqsave(lock, flags);
v->counter -= a;
spin_unlock_irqrestore(lock, flags);
}
// EXPORT_SYMBOL(atomic64_sub);
long long atomic64_sub_return(long long a, atomic64_t *v)
{
unsigned long flags;
spinlock_t *lock = lock_addr(v);
long long val;
spin_lock_irqsave(lock, flags);
val = v->counter -= a;
spin_unlock_irqrestore(lock, flags);
return val;
}
// EXPORT_SYMBOL(atomic64_sub_return);
long long atomic64_dec_if_positive(atomic64_t *v)
{
unsigned long flags;
spinlock_t *lock = lock_addr(v);
long long val;
spin_lock_irqsave(lock, flags);
val = v->counter - 1;
if (val >= 0)
v->counter = val;
spin_unlock_irqrestore(lock, flags);
return val;
}
// EXPORT_SYMBOL(atomic64_dec_if_positive);
long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
{
unsigned long flags;
spinlock_t *lock = lock_addr(v);
long long val;
spin_lock_irqsave(lock, flags);
val = v->counter;
if (val == o)
v->counter = n;
spin_unlock_irqrestore(lock, flags);
return val;
}
// EXPORT_SYMBOL(atomic64_cmpxchg);
long long atomic64_xchg(atomic64_t *v, long long new)
{
unsigned long flags;
spinlock_t *lock = lock_addr(v);
long long val;
spin_lock_irqsave(lock, flags);
val = v->counter;
v->counter = new;
spin_unlock_irqrestore(lock, flags);
return val;
}
// EXPORT_SYMBOL(atomic64_xchg);
int atomic64_add_unless(atomic64_t *v, long long a, long long u)
{
unsigned long flags;
spinlock_t *lock = lock_addr(v);
int ret = 0;
spin_lock_irqsave(lock, flags);
if (v->counter != u) {
v->counter += a;
ret = 1;
}
spin_unlock_irqrestore(lock, flags);
return ret;
}
// EXPORT_SYMBOL(atomic64_add_unless);
#endif // #ifndef ATOMIC64_INIT


@@ -0,0 +1,69 @@
#include <common/Common.h>
#include <asm/atomic.h> // also adds ATOMIC64_INIT if available
#ifndef ATOMIC64_INIT // basic test if the kernel already provides atomic64_t
/*
* Note: Below is the atomic64.c copied from linux-git
*/
/*
* Generic implementation of 64-bit atomics using spinlocks,
* useful on processors that don't have 64-bit atomic instructions.
*
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ASM_GENERIC_ATOMIC64_H
#define _ASM_GENERIC_ATOMIC64_H
typedef struct {
long long counter;
spinlock_t lock; // added for fhgfs
} atomic64_t;
// #define ATOMIC64_INIT(i) { (i) } // disabled for fhgfs
static inline void atomic_init(atomic64_t *atomic, uint64_t value); // added for fhgfs
extern long long atomic64_read(const atomic64_t *v);
extern void atomic64_set(atomic64_t *v, long long i);
extern void atomic64_add(long long a, atomic64_t *v);
extern long long atomic64_add_return(long long a, atomic64_t *v);
extern void atomic64_sub(long long a, atomic64_t *v);
extern long long atomic64_sub_return(long long a, atomic64_t *v);
extern long long atomic64_dec_if_positive(atomic64_t *v);
extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n);
extern long long atomic64_xchg(atomic64_t *v, long long new);
extern int atomic64_add_unless(atomic64_t *v, long long a, long long u);
#define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
#define atomic64_inc(v) atomic64_add(1LL, (v))
#define atomic64_inc_return(v) atomic64_add_return(1LL, (v))
#define atomic64_inc_and_test(v) (atomic64_inc_return(v) == 0)
#define atomic64_sub_and_test(a, v) (atomic64_sub_return((a), (v)) == 0)
#define atomic64_dec(v) atomic64_sub(1LL, (v))
#define atomic64_dec_return(v) atomic64_sub_return(1LL, (v))
#define atomic64_dec_and_test(v) (atomic64_dec_return((v)) == 0)
#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1LL, 0LL)
/*
* Initializer for fhgfs, replacement for ATOMIC64_INIT(i)
*/
void atomic_init(atomic64_t* atomic, uint64_t value)
{
spin_lock_init(&atomic->lock);
atomic->counter = value;
}
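/*
 * Illustrative sketch (not part of the upstream sources): since ATOMIC64_INIT() is disabled
 * in this fallback, counters have to be initialized at runtime via atomic_init() before any
 * other atomic64_* call; exampleCounter is hypothetical.
 *
 *    static atomic64_t exampleCounter;
 *
 *    atomic_init(&exampleCounter, 0);
 *    atomic64_inc(&exampleCounter);
 *    printk(KERN_INFO "counter: %lld\n", atomic64_read(&exampleCounter) );
 */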
#endif /* _ASM_GENERIC_ATOMIC64_H */
#endif // #ifndef ATOMIC64_INIT


@@ -0,0 +1,144 @@
#include <os/iov_iter.h>
#include <linux/mm.h>
#include <linux/slab.h>
static void beegfs_readsink_reserve_no_pipe(BeeGFS_ReadSink *rs, struct iov_iter *iter, size_t size)
{
rs->sanitized_iter = *iter;
iov_iter_truncate(&rs->sanitized_iter, size);
}
#ifdef KERNEL_HAS_ITER_PIPE
static size_t compute_max_pagecount(size_t size)
{
// Compute maximal number of pages (in the pipe) that need to be present at once.
// We don't know the page-relative offset from which max_size bytes will be reserved.
// Assume the worst case.
size_t max_offset = PAGE_SIZE - 1;
size_t max_pages = (max_offset + size + PAGE_SIZE - 1) / PAGE_SIZE;
return max_pages;
}
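/* Worked example (illustrative, assuming PAGE_SIZE == 4096): for size == 10000 the worst
 * case is a region starting at in-page offset 4095, spanning bytes 4095..14094 and thus
 * 4 pages; the formula above gives (4095 + 10000 + 4095) / 4096 == 4. */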
static void beegfs_readsink_reserve_pipe(BeeGFS_ReadSink *rs, struct iov_iter *iter, size_t size)
{
size_t max_pages;
// struct should be zeroed
BUG_ON(rs->npages != 0);
BUG_ON(rs->pages != 0);
BUG_ON(rs->bvecs != 0);
// should we disallow size > iter count?
size = min_t(size_t, size, iov_iter_count(iter));
max_pages = compute_max_pagecount(size);
// Could be kmalloc() instead of kzalloc(), but the iov_iter_get_pages() API
// gives back a byte count which makes it hard to detect initialization bugs
// related to the page pointers.
rs->pages = kzalloc(max_pages * sizeof *rs->pages, GFP_NOFS);
if (! rs->pages)
return;
rs->bvecs = kmalloc(max_pages * sizeof *rs->bvecs, GFP_NOFS);
if (! rs->bvecs)
return;
{
struct bio_vec *const bvecs = rs->bvecs;
struct page **const pages = rs->pages;
long unsigned start;
ssize_t gpr;
size_t view_size = 0;
#ifdef KERNEL_HAS_IOV_ITER_GET_PAGES2
struct iov_iter copyIter = *iter; // Copy the iterator because iov_iter_get_pages2()
// also auto-advances it, and we don't want the auto-advance here: the while loop at
// the end of FhgfsOpsRemoting_readfileVec() already advances the iterator itself.
gpr = iov_iter_get_pages2(&copyIter, pages, size, max_pages, &start);
#else
gpr = iov_iter_get_pages(iter, pages, size, max_pages, &start);
#endif
if (gpr < 0)
{
// indicate error?
// probably not necessary. The sanitized_iter field will be initialized with count 0.
}
else if (gpr > 0)
{
size_t bvs_size = 0;
size_t np = 0;
view_size = gpr;
for (np = 0; bvs_size < view_size; np++)
{
long unsigned offset = start;
long unsigned len = min_t(size_t, view_size - bvs_size, PAGE_SIZE - start);
BUG_ON(np >= max_pages);
BUG_ON(! pages[np]);
bvs_size += len;
start = 0;
bvecs[np] = (struct bio_vec) {
.bv_page = pages[np],
.bv_offset = offset,
.bv_len = len,
};
}
// make sure we're using all the pages that iov_iter_get_pages() gave us.
//BUG_ON(np < max_pages && pages[np]);
WARN_ON(np < max_pages && pages[np]);
rs->npages = np;
}
BEEGFS_IOV_ITER_BVEC(&rs->sanitized_iter, READ, bvecs, rs->npages, view_size);
}
}
#endif
void beegfs_readsink_reserve(BeeGFS_ReadSink *rs, struct iov_iter *iter, size_t size)
{
#ifdef KERNEL_HAS_ITER_PIPE
if (iov_iter_type(iter) == ITER_PIPE)
beegfs_readsink_reserve_pipe(rs, iter, size);
else
beegfs_readsink_reserve_no_pipe(rs, iter, size);
#else
beegfs_readsink_reserve_no_pipe(rs, iter, size);
#endif
}
void beegfs_readsink_release(BeeGFS_ReadSink *rs)
{
int npages = rs->npages;
struct page **pages = rs->pages;
for (int i = 0; i < npages; i++)
{
put_page(pages[i]);
pages[i] = NULL; // avoid this write?
}
kfree(rs->pages);
kfree(rs->bvecs);
memset(rs, 0, sizeof *rs);
}


@@ -0,0 +1,210 @@
/*
* compatibility for older kernels. this code is mostly taken from include/linux/uio.h,
* include/linuxfs/fs.h and associated .c files.
*
* the originals are licensed as:
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#pragma once
#include <linux/kernel.h>
#include <linux/uio.h>
#include <linux/version.h>
#include <linux/uaccess.h>
#include <linux/bvec.h>
#ifndef KERNEL_HAS_ITER_KVEC
#error ITER_KVEC is a required feature
#endif
#ifndef KERNEL_HAS_ITER_IS_IOVEC
#error iter_is_iovec() is a required feature
#endif
/*
* In kernels 3.15 to 6.3 there was iov_iter_iovec(), returning the first iovec
* in an iov_iter of type ITER_IOVEC.
* 6.4 removed it and started using the iter_iov_addr & iter_iov_len macros.
* We use those now and provide a shim for older kernels.
*/
#if !defined(KERNEL_HAS_ITER_IOV_ADDR)
#define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset)
#define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset)
#endif
#ifndef KERNEL_HAS_IOV_ITER_INIT_DIR
#error We require kernels that have a "direction" parameter to iov_iter_init().
#endif
#ifndef KERNEL_HAS_IOV_ITER_TYPE
static inline int iov_iter_type(const struct iov_iter *i)
{
return i->type & ~(READ | WRITE);
}
#endif
#ifndef KERNEL_HAS_IOV_ITER_IS_PIPE
static inline bool iov_iter_is_pipe(struct iov_iter* iter)
{
#ifdef KERNEL_HAS_ITER_PIPE
return iov_iter_type(iter) == ITER_PIPE;
#else
return false;
#endif
}
#endif
static inline int beegfs_iov_iter_is_iovec(const struct iov_iter *iter)
{
return iov_iter_type(iter) == ITER_IOVEC;
}
// TODO: Now that ITER_KVEC is required across all kernels, is this function still needed?
static inline struct iov_iter *beegfs_get_iovec_iov_iter(struct iov_iter *iter)
{
BUG_ON(!beegfs_iov_iter_is_iovec(iter));
return iter;
}
static inline unsigned long beegfs_iov_iter_nr_segs(const struct iov_iter *iter)
{
return iter->nr_segs;
}
static inline void beegfs_iov_iter_clear(struct iov_iter *iter)
{
iter->count = 0;
}
#ifdef KERNEL_HAS_ITER_PIPE
static inline bool beegfs_is_pipe_iter(struct iov_iter * iter)
{
return iov_iter_type(iter) == ITER_PIPE;
}
#endif
#define BEEGFS_IOV_ITER_INIT iov_iter_init
static inline void BEEGFS_IOV_ITER_KVEC(struct iov_iter *iter, int direction,
const struct kvec* kvec, unsigned long nr_segs, size_t count)
{
#ifndef KERNEL_HAS_IOV_ITER_KVEC_NO_TYPE_FLAG_IN_DIRECTION
direction |= ITER_KVEC;
#endif
iov_iter_kvec(iter, direction, kvec, nr_segs, count);
}
static inline void BEEGFS_IOV_ITER_BVEC(struct iov_iter *iter, int direction,
const struct bio_vec* bvec, unsigned long nr_segs, size_t count)
{
#ifndef KERNEL_HAS_IOV_ITER_KVEC_NO_TYPE_FLAG_IN_DIRECTION
direction |= ITER_BVEC;
#endif
iov_iter_bvec(iter, direction, bvec, nr_segs, count);
}
/*
BeeGFS_ReadSink
We can't get parallel reads to work easily with ITER_PIPE. That type of iter
doesn't allow splitting up a region easily for parallel writing. The reason
is that the iov_iter_advance() implementation for ITER_PIPE modifies shared
state (the pipe_inode structure).
The BeeGFS_ReadSink structure allows to abstract from that concern by
converting to an ITER_BVEC iter where necessary.
Use is as follows:
1) Initialize the struct by zeroing out, or using a {0} initializer.
This allows the cleanup routine to work even if nothing was ever
allocated.
2) Call _reserve() to set up a view of the given size into a given iov_iter
struct. If the given iov_iter is not of type ITER_PIPE, it will be copied
straight to the "sanitized_iter" field. Otherwise (if it is an
ITER_PIPE), an ITER_BVEC iterator will be made by allocating pages from
the pipe and setting up a bio_vec for each page.
Note that this can fail in low memory situations. The size of the view
that was successfully allocated can be queried by calling
iov_iter_count() on the sanitized_iter field.
3) The sanitized_iter field should be used to read data. The field can be
used destructively. In particular it is safe to call iov_iter_advance()
on it in order to partition the view for multiple parallel reads.
4) When reads are done, iov_iter_advance() should probably be called on
the iter that was given to _reserve().
5) Call _release() to give back the pages that were reserved in step 2).
If the struct was properly initialized in step 1), it is safe to call
_release() even if _reserve() was never called. This is useful when cleaning
up state after an early exit.
6) Go back to 2) if necessary, to copy more data.
*/
typedef struct _BeeGFS_ReadSink BeeGFS_ReadSink;
struct _BeeGFS_ReadSink {
size_t npages; // Number of pages currently in use (get_page())
struct page **pages; // 0..npages
struct bio_vec *bvecs; // 0..npages
// output value
struct iov_iter sanitized_iter;
};
void beegfs_readsink_reserve(BeeGFS_ReadSink *rs, struct iov_iter *iter, size_t size);
void beegfs_readsink_release(BeeGFS_ReadSink *rs);
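/*
 * Illustrative sketch (not part of the upstream sources) of the call sequence described
 * above; userIter and readSize are hypothetical, and the actual read submission is elided.
 *
 *    BeeGFS_ReadSink rs = {0};
 *
 *    beegfs_readsink_reserve(&rs, userIter, readSize);
 *
 *    size_t reserved = iov_iter_count(&rs.sanitized_iter);
 *    // ... read "reserved" bytes into rs.sanitized_iter, possibly splitting it into
 *    //     parallel slices via iov_iter_advance() ...
 *
 *    iov_iter_advance(userIter, reserved);
 *    beegfs_readsink_release(&rs);
 */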
/*
We have lots of code locations where we need to read or write memory using a
pointer + length pair, but need to use an iov_iter based API. This always
leads to boilerplate where struct iovec and struct iov_iter values have to be
declared on the stack. The following hack is meant to reduce that boilerplate.
*/
#define STACK_ALLOC_BEEGFS_ITER_IOV(ptr, size, direction) \
___BEEGFS_IOV_ITER_INIT(&(struct iov_iter){0}, &(struct iovec){0}, (ptr), (size), (direction))
#define STACK_ALLOC_BEEGFS_ITER_KVEC(ptr, size, direction) \
___BEEGFS_IOV_ITER_KVEC(&(struct iov_iter){0}, &(struct kvec){0}, (ptr), (size), (direction))
static inline struct iov_iter *___BEEGFS_IOV_ITER_INIT(
struct iov_iter *iter, struct iovec *iovec,
const char __user *ptr, size_t size, int direction)
{
unsigned nr_segs = 1;
*iovec = (struct iovec) {
.iov_base = (char __user *) ptr,
.iov_len = size,
};
BEEGFS_IOV_ITER_INIT(iter, direction, iovec, nr_segs, size);
return iter;
}
static inline struct iov_iter *___BEEGFS_IOV_ITER_KVEC(
struct iov_iter *iter, struct kvec* kvec,
const char *ptr, size_t size, int direction)
{
unsigned nr_segs = 1;
*kvec = (struct kvec) {
.iov_base = (char *) ptr,
.iov_len = size,
};
BEEGFS_IOV_ITER_KVEC(iter, direction, kvec, nr_segs, size);
return iter;
}
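/*
 * Illustrative sketch (not part of the upstream sources): the helpers above turn a plain
 * pointer + length pair into a ready-to-use iov_iter without declaring the iovec/kvec
 * boilerplate locally; exampleBuf is hypothetical.
 *
 *    char exampleBuf[64];
 *    struct iov_iter* iter = STACK_ALLOC_BEEGFS_ITER_KVEC(exampleBuf, sizeof exampleBuf, READ);
 *
 *    // iter now describes exampleBuf and can be passed to iov_iter based helpers
 */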